Fix merge issues when incorporating new AF calculations changes

This commit is contained in:
Guillermo del Angel 2012-03-27 15:00:44 -04:00
commit 343a061b1c
26 changed files with 881 additions and 614 deletions

View File

@ -1,4 +1,4 @@
Copyright (c) 2011 The Broad Institute
Copyright (c) 2012 The Broad Institute
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation

View File

@ -26,14 +26,12 @@ package org.broadinstitute.sting.gatk.report;
import org.apache.commons.lang.math.NumberUtils;
import java.util.Arrays;
import java.util.Collection;
import java.util.TreeMap;
import java.util.*;
/**
* Holds values for a column in a GATK report table
*/
public class GATKReportColumn extends TreeMap<Object, Object> {
public class GATKReportColumn extends LinkedHashMap<Object, Object> {
final private String columnName;
final private Object defaultValue;
final private String format;

View File

@ -756,10 +756,6 @@ public class GATKReportTable {
*/
// Get the column widths for everything
HashMap<String, GATKReportColumnFormat> columnFormats = new HashMap<String, GATKReportColumnFormat>();
for (String columnName : columns.keySet()) {
columnFormats.put(columnName, columns.get(columnName).getColumnFormat());
}
String primaryKeyFormat = "%-" + getPrimaryKeyColumnWidth() + "s";
// Emit the table definition
@ -787,7 +783,7 @@ public class GATKReportTable {
if (needsPadding) {
out.printf(" ");
}
out.printf(columnFormats.get(columnName).getNameFormat(), columnName);
out.printf(columns.get(columnName).getColumnFormat().getNameFormat(), columnName);
needsPadding = true;
}
@ -796,29 +792,31 @@ public class GATKReportTable {
out.printf("%n");
// Emit the table body
for (Object primaryKey : primaryKeyColumn) {
for (final Object primaryKey : primaryKeyColumn) {
needsPadding = false;
if (primaryKeyDisplay) {
out.printf(primaryKeyFormat, primaryKey);
needsPadding = true;
}
for (String columnName : columns.keySet()) {
if (columns.get(columnName).isDisplayable()) {
for (final Map.Entry<String, GATKReportColumn> entry : columns.entrySet()) {
final GATKReportColumn column = entry.getValue();
if (column.isDisplayable()) {
if (needsPadding) {
out.printf(" ");
out.print(" ");
}
String value = columns.get(columnName).getStringValue(primaryKey);
out.printf(columnFormats.get(columnName).getValueFormat(), value);
final String value = column.getStringValue(primaryKey);
out.printf(column.getColumnFormat().getValueFormat(), value);
needsPadding = true;
}
}
out.printf("%n");
out.println();
}
out.printf("%n");
out.println();
}
public int getNumRows() {

View File

@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -68,6 +69,9 @@ public class PileupWalker extends LocusWalker<Integer, Integer> implements TreeR
@Argument(fullName="showIndelPileups",shortName="show_indels",doc="In addition to base pileups, generate pileups of extended indel events")
public boolean SHOW_INDEL_PILEUPS = false;
@Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output")
public boolean SHOW_VERBOSE = false;
@Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false)
public List<RodBinding<Feature>> rods = Collections.emptyList();
@ -82,7 +86,10 @@ public class PileupWalker extends LocusWalker<Integer, Integer> implements TreeR
if ( context.hasBasePileup() ) {
ReadBackedPileup basePileup = context.getBasePileup();
out.printf("%s %s%n", basePileup.getPileupString(ref.getBaseAsChar()), rods);
out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods);
if ( SHOW_VERBOSE )
out.printf(" %s", createVerboseOutput(basePileup));
out.println();
}
if ( context.hasExtendedEventPileup() ) {
@ -125,6 +132,24 @@ public class PileupWalker extends LocusWalker<Integer, Integer> implements TreeR
return rodString;
}
/**
 * Renders a comma-separated summary of every element in the pileup, one
 * "readName:offset:readLength" entry per element.
 *
 * @param pileup the pileup whose elements are summarized
 * @return the joined summary string (empty when the pileup has no elements)
 */
private static String createVerboseOutput(final ReadBackedPileup pileup) {
    final StringBuilder builder = new StringBuilder();
    String separator = "";
    for ( final PileupElement element : pileup ) {
        builder.append(separator)
               .append(element.getRead().getReadName())
               .append(":")
               .append(element.getOffset())
               .append(":")
               .append(element.getRead().getReadLength());
        separator = ",";  // only between entries, never leading
    }
    return builder.toString();
}
@Override
public void onTraversalDone(Integer result) {

View File

@ -91,6 +91,25 @@ public class ErrorRatePerCycle extends LocusWalker<Integer, Integer> {
this.cycle = cycle;
}
// Must overload hashCode and equals to properly work with GATKReportColumn
/** Hash consistent with equals(): mixes the cycle with the read group's hash. */
@Override
public int hashCode() {
    return 33 * cycle + readGroup.hashCode();
}
/** Two keys are equal when both the cycle and the read group match. */
@Override
public boolean equals(final Object o) {
    if ( this == o )
        return true;
    if ( o == null || getClass() != o.getClass() )
        return false;
    final TableKey that = (TableKey) o;
    return cycle == that.cycle && readGroup.equals(that.readGroup);
}
@Override
public int compareTo(final TableKey tableKey) {
final int scmp = readGroup.compareTo(tableKey.readGroup);

View File

@ -70,6 +70,6 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
* @return the alleles used for genotyping
*/
protected abstract List<Allele> getLog10PNonRef(final VariantContext vc,
final double[][] log10AlleleFrequencyPriors,
final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result);
}

View File

@ -25,6 +25,11 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.ArrayList;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: ebanks
@ -34,23 +39,54 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
*/
public class AlleleFrequencyCalculationResult {
// IMPORTANT NOTE:
// These 2 arrays are intended to contain the likelihoods/posterior probabilities for each alternate allele over each possible frequency (from 0 to 2N).
// For any given alternate allele and frequency, the likelihoods are marginalized over values for all other alternate alleles. What this means is that
// the likelihoods at cell index zero (AF=0) in the array is actually that of the site's being polymorphic (because although this alternate allele may
// be at AF=0, it is marginalized over all other alternate alleles which are not necessarily at AF=0).
// In the bi-allelic case (where there are no other alternate alleles over which to marginalize),
// the value at cell index zero will be equal to AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED.
final double[][] log10AlleleFrequencyLikelihoods;
final double[][] log10AlleleFrequencyPosteriors;
// These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
private double log10MLE;
private double log10MAP;
final private int[] alleleCountsOfMLE;
final private int[] alleleCountsOfMAP;
// These 2 variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
double log10LikelihoodOfAFzero = 0.0;
double log10PosteriorOfAFzero = 0.0;
// The posteriors seen, not including that of AF=0
// TODO -- better implementation needed here (see below)
private ArrayList<Double> log10PosteriorMatrixValues = new ArrayList<Double>(100000);
private Double log10PosteriorMatrixSum = null;
public AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) {
log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1];
log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1];
// These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
private double log10LikelihoodOfAFzero;
private double log10PosteriorOfAFzero;
/**
 * Creates a result holder sized for the given number of alternate alleles;
 * all values start in the reset() sentinel state.
 *
 * @param maxAltAlleles number of alternate alleles whose MLE/MAP counts are tracked
 */
public AlleleFrequencyCalculationResult(final int maxAltAlleles) {
alleleCountsOfMLE = new int[maxAltAlleles];
alleleCountsOfMAP = new int[maxAltAlleles];
reset();
}
/** @return the best (maximum) log10 likelihood recorded so far over all alternate alleles */
public double getLog10MLE() {
    return this.log10MLE;
}
/** @return the best (maximum) log10 posterior recorded so far over all alternate alleles */
public double getLog10MAP() {
    return this.log10MAP;
}
/**
 * Lazily computes, then caches, the log10 sum of every posterior value
 * recorded via updateMAPifNeeded().  Note this excludes the AF=0 posterior,
 * which is stored separately.
 */
public double getLog10PosteriorMatrixSum() {
    if ( log10PosteriorMatrixSum == null ) {
        // TODO -- we absolutely need a better implementation here as we don't want to store all values from the matrix in memory;
        // TODO -- will discuss with the team what the best option is
        final double[] values = new double[log10PosteriorMatrixValues.size()];
        int idx = 0;
        for ( final Double value : log10PosteriorMatrixValues )
            values[idx++] = value;
        log10PosteriorMatrixSum = MathUtils.log10sumLog10(values);
    }
    return log10PosteriorMatrixSum;
}
/**
 * @return the per-allele counts backing the current MLE.
 * NOTE(review): this hands out the live internal array (mutated by
 * updateMLEifNeeded/reset) — callers must not modify it; confirm no caller does.
 */
public int[] getAlleleCountsOfMLE() {
    return this.alleleCountsOfMLE;
}
/**
 * @return the per-allele counts backing the current MAP.
 * NOTE(review): returns the live internal array (mutated by
 * updateMAPifNeeded/reset) — callers must not modify it; confirm no caller does.
 */
public int[] getAlleleCountsOfMAP() {
    return this.alleleCountsOfMAP;
}
public double getLog10LikelihoodOfAFzero() {
@ -60,4 +96,47 @@ public class AlleleFrequencyCalculationResult {
/** @return the log10 posterior probability that the site is monomorphic (AF=0 for all alternate alleles) */
public double getLog10PosteriorOfAFzero() {
    return this.log10PosteriorOfAFzero;
}
/**
 * Restores this holder to its pristine state so it can be reused for the next
 * site: sentinel values for all MLE/MAP quantities, zeroed allele-count arrays,
 * and a cleared (uncached) posterior-value store.
 */
public void reset() {
    log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
    Arrays.fill(alleleCountsOfMLE, 0);
    Arrays.fill(alleleCountsOfMAP, 0);
    log10PosteriorMatrixValues.clear();
    log10PosteriorMatrixSum = null;  // invalidate the lazy sum cache
}
/**
 * Records log10LofK as the new maximum-likelihood estimate when it beats the
 * current one, capturing the associated per-allele counts.
 *
 * @param log10LofK        candidate log10 likelihood for this allele-count configuration
 * @param alleleCountsForK the per-allele counts that produced log10LofK
 */
public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
    if ( log10LofK > log10MLE ) {
        log10MLE = log10LofK;
        System.arraycopy(alleleCountsForK, 0, alleleCountsOfMLE, 0, alleleCountsForK.length);
    }
}
/**
 * Records every candidate posterior value (feeding getLog10PosteriorMatrixSum)
 * and promotes log10LofK to the MAP when it beats the current one, capturing
 * the associated per-allele counts.
 *
 * @param log10LofK        candidate log10 posterior for this allele-count configuration
 * @param alleleCountsForK the per-allele counts that produced log10LofK
 */
public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
    log10PosteriorMatrixValues.add(log10LofK);  // always recorded, even when not the new MAP
    if ( log10LofK > log10MAP ) {
        log10MAP = log10LofK;
        System.arraycopy(alleleCountsForK, 0, alleleCountsOfMAP, 0, alleleCountsForK.length);
    }
}
/**
 * Stores the AF=0 likelihood; if the monomorphic configuration is now the most
 * likely one overall, the MLE and its counts (all zero) are updated to match.
 */
public void setLog10LikelihoodOfAFzero(final double value) {
    this.log10LikelihoodOfAFzero = value;
    if ( value > log10MLE ) {
        // AF=0 beats every polymorphic configuration seen so far
        log10MLE = value;
        Arrays.fill(alleleCountsOfMLE, 0);
    }
}
/**
 * Stores the AF=0 posterior; if the monomorphic configuration is now the most
 * probable one overall, the MAP and its counts (all zero) are updated to match.
 */
public void setLog10PosteriorOfAFzero(final double value) {
    this.log10PosteriorOfAFzero = value;
    if ( value > log10MAP ) {
        // AF=0 beats every polymorphic configuration seen so far
        log10MAP = value;
        Arrays.fill(alleleCountsOfMAP, 0);
    }
}
}

View File

@ -0,0 +1,286 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.*;
/**
* Code for determining which indels are segregating among the samples.
*
* This code is just a refactor of the original code from Guillermo in the UG.
*
* @author Mark DePristo
* @since 3/26/12
*/
public class ConsensusAlleleCounter {
final protected static Logger logger = Logger.getLogger(ConsensusAlleleCounter.class);
// Minimum number of supporting reads (summed over samples) before an indel allele is genotyped
private final int minIndelCountForGenotyping;
// When true, keep every allele passing the count threshold; otherwise keep only the single best one
private final boolean doMultiAllelicCalls;
// Minimum fraction of indel-bearing reads required within at least one sample for that sample to count
private final double minFractionInOneSample;
private final GenomeLocParser locParser;
public ConsensusAlleleCounter(final GenomeLocParser locParser,
final boolean doMultiAllelicCalls,
final int minIndelCountForGenotyping,
final double minFractionInOneSample) {
this.minIndelCountForGenotyping = minIndelCountForGenotyping;
this.doMultiAllelicCalls = doMultiAllelicCalls;
this.minFractionInOneSample = minFractionInOneSample;
this.locParser = locParser;
}
/**
* Returns a list of Alleles at this locus that may be segregating.
*
* @param ref         the reference context at the locus being examined
* @param contexts    per-sample alignment contexts carrying the indel pileups
* @param contextType which read orientation(s) to stratify the contexts by
* @return the merged set of alleles passing the count threshold (empty if none)
*/
public List<Allele> computeConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType) {
final Map<String, Integer> consensusIndelStrings = countConsensusAlleles(ref, contexts, contextType);
// logger.info("Alleles at " + ref.getLocus());
// for ( Map.Entry<String, Integer> elt : consensusIndelStrings.entrySet() ) {
// logger.info(" " + elt.getValue() + " => " + elt.getKey());
// }
return consensusCountsToAlleles(ref, consensusIndelStrings);
}
//
// TODO -- WARNING DOESN'T WORK WITH REDUCED READS
//
// Tallies, across all samples, how many reads support each candidate indel.
// Insertions are keyed by their inserted bases; deletions by "D<length>".
private Map<String, Integer> countConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType) {
final GenomeLoc loc = ref.getLocus();
HashMap<String, Integer> consensusIndelStrings = new HashMap<String, Integer>();
int insCount = 0, delCount = 0;
// quick check of total number of indels in pileup
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
insCount += indelPileup.getNumberOfInsertions();
delCount += indelPileup.getNumberOfDeletions();
}
// bail out early when neither event type can possibly reach the genotyping threshold
if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping)
return Collections.emptyMap();
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
// todo -- warning, can be duplicating expensive partition here
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
final int nIndelReads = indelPileup.getNumberOfInsertions() + indelPileup.getNumberOfDeletions();
final int nReadsOverall = indelPileup.getNumberOfElements();
// skip samples whose indel support is absent or below the per-sample fraction cutoff
if ( nIndelReads == 0 || (nIndelReads / (1.0 * nReadsOverall)) < minFractionInOneSample) {
// if ( nIndelReads > 0 )
// logger.info("Skipping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall);
continue;
// } else {
// logger.info("### Keeping sample " + sample.getKey() + " with nIndelReads " + nIndelReads + " nReads " + nReadsOverall);
}
for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) {
final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
if (read == null)
continue;
// 454 reads are excluded from consensus building
if (ReadUtils.is454Read(read)) {
continue;
}
/* if (DEBUG && p.isIndel()) {
System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n",
read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(),
p.getEventLength(),p.getType().toString(), p.getEventBases());
}
*/
String indelString = p.getEventBases();
if (p.isInsertion()) {
boolean foundKey = false;
// copy of hashmap into temp arrayList
ArrayList<Pair<String,Integer>> cList = new ArrayList<Pair<String,Integer>>();
for (String s : consensusIndelStrings.keySet()) {
cList.add(new Pair<String, Integer>(s,consensusIndelStrings.get(s)));
}
if (read.getAlignmentEnd() == loc.getStart()) {
// first corner condition: a read has an insertion at the end, and we're right at the insertion.
// In this case, the read could have any of the inserted bases and we need to build a consensus
for (int k=0; k < cList.size(); k++) {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
// case 1: current insertion is prefix of indel in hash map
if (s.startsWith(indelString)) {
cList.set(k,new Pair<String, Integer>(s,cnt+1));
foundKey = true;
}
else if (indelString.startsWith(s)) {
// case 2: indel stored in hash table is prefix of current insertion
// In this case, new bases are new key.
foundKey = true;
cList.set(k,new Pair<String, Integer>(indelString,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
cList.add(new Pair<String, Integer>(indelString,1));
}
else if (read.getAlignmentStart() == loc.getStart()+1) {
// opposite corner condition: read will start at current locus with an insertion
for (int k=0; k < cList.size(); k++) {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
if (s.endsWith(indelString)) {
// case 1: current insertion (indelString) is suffix of indel in hash map (s)
cList.set(k,new Pair<String, Integer>(s,cnt+1));
foundKey = true;
}
else if (indelString.endsWith(s)) {
// case 2: indel stored in hash table is suffix of current insertion
// In this case, new bases are new key.
foundKey = true;
cList.set(k,new Pair<String, Integer>(indelString,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
cList.add(new Pair<String, Integer>(indelString,1));
}
else {
// normal case: insertion somewhere in the middle of a read: add count to arrayList
int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
cList.add(new Pair<String, Integer>(indelString,cnt+1));
}
// copy back arrayList into hashMap
// (a later duplicate key wins on put, so an incremented count replaces the stale one)
consensusIndelStrings.clear();
for (Pair<String,Integer> pair : cList) {
consensusIndelStrings.put(pair.getFirst(),pair.getSecond());
}
}
else if (p.isDeletion()) {
// deletions are keyed only by their length, e.g. "D3"
indelString = String.format("D%d",p.getEventLength());
int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
consensusIndelStrings.put(indelString,cnt+1);
}
}
}
return consensusIndelStrings;
}
// Converts the consensus string->count table into VariantContext alleles:
// each key passing the threshold becomes a 2-allele VC, and all VCs are
// merged so overlapping candidates produce one consistent allele set.
private List<Allele> consensusCountsToAlleles(final ReferenceContext ref,
final Map<String, Integer> consensusIndelStrings) {
final GenomeLoc loc = ref.getLocus();
final Collection<VariantContext> vcs = new ArrayList<VariantContext>();
int maxAlleleCnt = 0;
Allele refAllele, altAllele;
for (final Map.Entry<String, Integer> elt : consensusIndelStrings.entrySet()) {
final String s = elt.getKey();
final int curCnt = elt.getValue();
int stop = 0;
// if observed count is above minimum threshold, we will genotype this allele
if (curCnt < minIndelCountForGenotyping)
continue;
if (s.startsWith("D")) {
// get deletion length
final int dLen = Integer.valueOf(s.substring(1));
// get ref bases of accurate deletion
final int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart();
stop = loc.getStart() + dLen;
final byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen);
if (Allele.acceptableAlleleBases(refBases)) {
refAllele = Allele.create(refBases, true);
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
}
else continue; // don't go on with this allele if refBases are non-standard
} else {
// insertion case
if (Allele.acceptableAlleleBases(s)) {
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
altAllele = Allele.create(s, false);
stop = loc.getStart();
}
else continue; // go on to next allele if consensus insertion has any non-standard base.
}
final VariantContextBuilder builder = new VariantContextBuilder().source("");
builder.loc(loc.getContig(), loc.getStart(), stop);
builder.alleles(Arrays.asList(refAllele, altAllele));
builder.referenceBaseForIndel(ref.getBase());
builder.noGenotypes();
if (doMultiAllelicCalls)
vcs.add(builder.make());
else {
// single-allele mode: retain only the highest-count candidate seen so far
if (curCnt > maxAlleleCnt) {
maxAlleleCnt = curCnt;
vcs.clear();
vcs.add(builder.make());
}
}
}
if (vcs.isEmpty())
return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion
final VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
return mergedVC.getAlleles();
}
}

View File

@ -52,11 +52,9 @@ import java.util.*;
public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
private final int HAPLOTYPE_SIZE;
private final int minIndelCountForGenotyping;
private final boolean getAlleleListFromVCF;
private boolean DEBUG = false;
private final boolean doMultiAllelicCalls = true;
private boolean ignoreSNPAllelesWhenGenotypingIndels = false;
private PairHMMIndelErrorModel pairModel;
@ -72,217 +70,30 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
// gdebug removeme
// todo -cleanup
private GenomeLoc lastSiteVisited;
private ArrayList<Allele> alleleList;
private List<Allele> alleleList = new ArrayList<Allele>();
static {
indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>());
}
public IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger);
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
alleleList = new ArrayList<Allele>();
getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
HAPLOTYPE_SIZE = UAC.INDEL_HAPLOTYPE_SIZE;
DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO;
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES;
}
public static ArrayList<Allele> computeConsensusAlleles(ReferenceContext ref,
Map<String, AlignmentContext> contexts,
AlignmentContextUtils.ReadOrientation contextType, GenomeLocParser locParser,
int minIndelCountForGenotyping, boolean doMultiAllelicCalls) {
Allele refAllele = null, altAllele = null;
GenomeLoc loc = ref.getLocus();
ArrayList<Allele> aList = new ArrayList<Allele>();
HashMap<String, Integer> consensusIndelStrings = new HashMap<String, Integer>();
int insCount = 0, delCount = 0;
// quick check of total number of indels in pileup
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
insCount += indelPileup.getNumberOfInsertions();
delCount += indelPileup.getNumberOfDeletions();
}
if (insCount < minIndelCountForGenotyping && delCount < minIndelCountForGenotyping)
return aList;
for (Map.Entry<String, AlignmentContext> sample : contexts.entrySet()) {
// todo -- warning, can be duplicating expensive partition here
AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType);
final ReadBackedExtendedEventPileup indelPileup = context.getExtendedEventPileup();
for (ExtendedEventPileupElement p : indelPileup.toExtendedIterable()) {
//SAMRecord read = p.getRead();
GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(p.getRead());
if (read == null)
continue;
if (ReadUtils.is454Read(read)) {
continue;
}
/* if (DEBUG && p.isIndel()) {
System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n",
read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(),
p.getEventLength(),p.getType().toString(), p.getEventBases());
}
*/
String indelString = p.getEventBases();
if (p.isInsertion()) {
boolean foundKey = false;
// copy of hashmap into temp arrayList
ArrayList<Pair<String,Integer>> cList = new ArrayList<Pair<String,Integer>>();
for (String s : consensusIndelStrings.keySet()) {
cList.add(new Pair<String, Integer>(s,consensusIndelStrings.get(s)));
}
if (read.getAlignmentEnd() == loc.getStart()) {
// first corner condition: a read has an insertion at the end, and we're right at the insertion.
// In this case, the read could have any of the inserted bases and we need to build a consensus
for (int k=0; k < cList.size(); k++) {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
// case 1: current insertion is prefix of indel in hash map
if (s.startsWith(indelString)) {
cList.set(k,new Pair<String, Integer>(s,cnt+1));
foundKey = true;
}
else if (indelString.startsWith(s)) {
// case 2: indel stored in hash table is prefix of current insertion
// In this case, new bases are new key.
foundKey = true;
cList.set(k,new Pair<String, Integer>(indelString,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
cList.add(new Pair<String, Integer>(indelString,1));
}
else if (read.getAlignmentStart() == loc.getStart()+1) {
// opposite corner condition: read will start at current locus with an insertion
for (int k=0; k < cList.size(); k++) {
String s = cList.get(k).getFirst();
int cnt = cList.get(k).getSecond();
if (s.endsWith(indelString)) {
// case 1: current insertion (indelString) is suffix of indel in hash map (s)
cList.set(k,new Pair<String, Integer>(s,cnt+1));
foundKey = true;
}
else if (indelString.endsWith(s)) {
// case 2: indel stored in hash table is prefix of current insertion
// In this case, new bases are new key.
foundKey = true;
cList.set(k,new Pair<String, Integer>(indelString,cnt+1));
}
}
if (!foundKey)
// none of the above: event bases not supported by previous table, so add new key
cList.add(new Pair<String, Integer>(indelString,1));
}
else {
// normal case: insertion somewhere in the middle of a read: add count to arrayList
int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
cList.add(new Pair<String, Integer>(indelString,cnt+1));
}
// copy back arrayList into hashMap
consensusIndelStrings.clear();
for (Pair<String,Integer> pair : cList) {
consensusIndelStrings.put(pair.getFirst(),pair.getSecond());
}
}
else if (p.isDeletion()) {
indelString = String.format("D%d",p.getEventLength());
int cnt = consensusIndelStrings.containsKey(indelString)? consensusIndelStrings.get(indelString):0;
consensusIndelStrings.put(indelString,cnt+1);
}
}
}
Collection<VariantContext> vcs = new ArrayList<VariantContext>();
int maxAlleleCnt = 0;
String bestAltAllele = "";
for (String s : consensusIndelStrings.keySet()) {
int curCnt = consensusIndelStrings.get(s), stop = 0;
// if observed count if above minimum threshold, we will genotype this allele
if (curCnt < minIndelCountForGenotyping)
continue;
if (s.startsWith("D")) {
// get deletion length
int dLen = Integer.valueOf(s.substring(1));
// get ref bases of accurate deletion
int startIdxInReference = 1 + loc.getStart() - ref.getWindow().getStart();
stop = loc.getStart() + dLen;
byte[] refBases = Arrays.copyOfRange(ref.getBases(), startIdxInReference, startIdxInReference + dLen);
if (Allele.acceptableAlleleBases(refBases)) {
refAllele = Allele.create(refBases, true);
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
}
else continue; // don't go on with this allele if refBases are non-standard
} else {
// insertion case
if (Allele.acceptableAlleleBases(s)) {
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
altAllele = Allele.create(s, false);
stop = loc.getStart();
}
else continue; // go on to next allele if consensus insertion has any non-standard base.
}
ArrayList vcAlleles = new ArrayList<Allele>();
vcAlleles.add(refAllele);
vcAlleles.add(altAllele);
final VariantContextBuilder builder = new VariantContextBuilder().source("");
builder.loc(loc.getContig(), loc.getStart(), stop);
builder.alleles(vcAlleles);
builder.referenceBaseForIndel(ref.getBase());
builder.noGenotypes();
if (doMultiAllelicCalls)
vcs.add(builder.make());
else {
if (curCnt > maxAlleleCnt) {
maxAlleleCnt = curCnt;
vcs.clear();
vcs.add(builder.make());
}
}
}
if (vcs.isEmpty())
return aList; // nothing else to do, no alleles passed minimum count criterion
VariantContext mergedVC = VariantContextUtils.simpleMerge(locParser, vcs, null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
aList = new ArrayList<Allele>(mergedVC.getAlleles());
return aList;
/**
 * Delegates segregating-indel discovery to a freshly constructed
 * ConsensusAlleleCounter configured from the walker's argument collection;
 * multi-allelic calling is always enabled here (second constructor arg).
 */
private List<Allele> computeConsensusAlleles(ReferenceContext ref,
                                             Map<String, AlignmentContext> contexts,
                                             AlignmentContextUtils.ReadOrientation contextType,
                                             GenomeLocParser locParser) {
    return new ConsensusAlleleCounter(locParser, true, UAC.MIN_INDEL_COUNT_FOR_GENOTYPING, UAC.MIN_INDEL_FRACTION_PER_SAMPLE)
            .computeConsensusAlleles(ref, contexts, contextType);
}
private final static EnumSet<VariantContext.Type> allowableTypes = EnumSet.of(VariantContext.Type.INDEL, VariantContext.Type.MIXED);
@ -338,7 +149,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
}
} else {
alleleList = computeConsensusAlleles(ref, contexts, contextType, locParser, minIndelCountForGenotyping,doMultiAllelicCalls);
alleleList = computeConsensusAlleles(ref, contexts, contextType, locParser);
if (alleleList.isEmpty())
return null;
}

View File

@ -1,209 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.apache.commons.lang.NotImplementedException;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.security.cert.CertificateNotYetValidException;
import java.util.*;
import org.broadinstitute.sting.utils.codecs.vcf.*;
/**
 * Walker that annotates sites with 95% credible bounds on the allele frequency
 * (written to the two-valued "AFB" INFO field), computed from the exact AF
 * posterior over all input variant tracks.
 *
 * NOTE(review): this walker is explicitly unsupported -- boundAlleleFrequency()
 * throws ReviewedStingException, so map() will fail on any biallelic record.
 * The fixes below address clear defects without claiming to make the walker
 * production-ready.
 *
 * User: chartl
 * Date: 8/30/11
 * Time: 10:08 AM
 */
public class UGBoundAF extends RodWalker<VariantContext,Integer> {

    @Output(shortName="vcf",fullName="VCF",doc="file to write to",required=true)
    VCFWriter writer;

    @Input(shortName="V",fullName="Variants",doc="variant tracks to use in calculation",required=true)
    List<RodBinding<VariantContext>> variants;

    // Smallest allele frequency treated as distinguishable from zero in the posterior math.
    private static double EPS_LOWER_LIMIT = Math.pow(10,-6.0);
    // NOTE(review): currently unused; presumably intended for caching epsilon-posterior bounds.
    private HashMap<Integer,Pair<Double,Double>> epsilonPosteriorCache = new HashMap<Integer,Pair<Double,Double>>(8192);
    // Cache of the AC=0 posterior normalization coefficient, keyed by allele number (AN).
    private HashMap<Integer,Double> logAC0Cache = new HashMap<Integer, Double>(8192);
    // NOTE(review): currently unused; presumably the [0,1] bin count for the greedy bound search.
    private int QUANTIZATION_FACTOR = 1000;

    /**
     * Builds the output VCF header from the union of all input-track header lines
     * plus the AFB INFO line this walker adds.
     */
    public void initialize() {
        Set<VCFHeaderLine> allHeaderLines = new HashSet<VCFHeaderLine>(1024);
        for ( RodBinding<VariantContext> v : variants ) {
            String trackName = v.getName();
            Map<String, VCFHeader> vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList(trackName));
            // FIX: these header lines were previously computed into a local that was never
            // accumulated, so the output header silently dropped all input-track metadata.
            allHeaderLines.addAll(vcfHeaders.get(trackName).getMetaData());
        }
        allHeaderLines.add(new VCFInfoHeaderLine("AFB",2,VCFHeaderLineType.Float,"The 95% bounds on the allele "+
                "frequency. First value=95% probability AF>x. Second value=95% probability AF<x."));
        writer.writeHeader(new VCFHeader(allHeaderLines));
    }

    public Integer reduceInit() {
        return 0;
    }

    /**
     * For each site, either flags multi-allelic records (filtered, no AFB) or computes the
     * AFB bound from the pooled genotype likelihoods of all overlapping variant contexts.
     *
     * @return the annotated site-only VariantContext, or null if no variants overlap
     */
    @Override
    public VariantContext map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext unused ) {
        List<VariantContext> allVariants = tracker.getValues(variants);
        if ( allVariants.size() == 0 ) {
            return null;
        }
        List<Allele> alternateAlleles = getAllAlternateAlleles(allVariants);
        // Site-only context (empty sample subset) used as the template for the output record.
        VariantContextBuilder builder = new VariantContextBuilder(allVariants.get(0).subContextFromSamples(new TreeSet<String>()));
        if ( alternateAlleles.size() > 1 ) {
            // Multiple segregating alts are not handled; emit a filtered record instead.
            logger.warn("Multiple Segregating Variants at position "+ref.getLocus().toString());
            alternateAlleles.add(allVariants.get(0).getReference());
            builder.alleles(alternateAlleles);
            builder.filters(String.format("MULTIPLE_SEGREGATING[%s]", Utils.join(",",alternateAlleles)));
        } else {
            // Pool all genotype likelihoods across tracks; no-calls enter as flat (0,0,0) GLs.
            GenotypesContext context = GenotypesContext.create();
            int numNoCall = 0;
            for ( VariantContext v : allVariants ) {
                numNoCall += v.getNoCallCount();
                context.addAll(v.getGenotypes());
            }
            builder.attribute("AFB",boundAlleleFrequency(getACPosteriors(context)));
        }
        return builder.make();
    }

    /** Collects the alternate alleles of every overlapping variant context (duplicates possible). */
    private List<Allele> getAllAlternateAlleles(List<VariantContext> variants) {
        List<Allele> alleles = new ArrayList<Allele>(3); // some overhead
        for ( VariantContext v : variants ) {
            alleles.addAll(v.getAlternateAlleles());
        }
        return alleles;
    }

    @Override
    public Integer reduce(VariantContext value, Integer sum) {
        if ( value == null )
            return sum;
        writer.add(value);
        return ++sum;
    }

    private int N_ITERATIONS = 1;

    /**
     * Runs the exact AF calculation with flat (uniform, all-zero log10) priors and returns
     * the log10 AC posterior vector for the single alternate allele.
     */
    private double[] getACPosteriors(GenotypesContext gc) {
        // note this uses uniform priors (!)
        double[][] zeroPriors = new double[1][1+2*gc.size()];
        AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2,2*gc.size());
        // todo -- allow multiple alleles here
        for ( int i = 0; i < N_ITERATIONS; i ++ ) {
            ExactAFCalculationModel.linearExactMultiAllelic(gc, 2, zeroPriors, result, false);
        }
        return result.log10AlleleFrequencyPosteriors[0];
    }

    /**
     * Intended to find the shortest interval [x,y] with 95% posterior mass on AF.
     * Unimplemented by design; always throws.
     */
    private String boundAlleleFrequency(double[] ACLikelihoods) {
        // note that no-calls are unnecessary: the ML likelihoods take nocalls into account as 0,0,0 GLs
        // thus, for sites with K 100,40,0 likelihoods and M no-calls, the likelihoods will be
        // agnostic between 2*K alleles through 2*(K+M) alleles - exactly what we want to marginalize over
        // want to pick a lower limit x and upper limit y such that
        // int_{f = x to y} sum_{c = 0 to 2*AN} P(AF=f | c, AN) df = 0.95
        // int_{f=x to y} calculateAFPosterior(f) df = 0.95
        // and that (y-x) is minimized
        // this is done by quantizing [0,1] into small bins and, since the distribution is
        // unimodal, greedily adding them until the probability is >= 0.95
        throw new ReviewedStingException("This walker is unsupported, and is not fully implemented", new NotImplementedException("bound allele frequency not implemented"));
    }

    /**
     * Marginalizes the per-AC posterior density at frequency af over all allele counts.
     *
     * NOTE(review): likelihoods[c] is never folded into probLiks[c]; the AC likelihood weights
     * appear to be intended as an additive log10 term here -- confirm before relying on this.
     */
    private double calculateAFPosterior(double[] likelihoods, double af) {
        double[] probLiks = new double[likelihoods.length];
        for ( int c = 0; c < likelihoods.length; c++) {
            probLiks[c] = calculateAFPosterior(c,likelihoods.length,af);
        }
        return MathUtils.log10sumLog10(probLiks);
    }

    /**
     * Evaluates the log10 allele frequency posterior density at AF given AC observations
     * of N chromosomes; AC in {0,1,2} use closed forms, larger AC a gamma approximation.
     */
    private double calculateAFPosterior(int ac, int n, double af) {
        // evaluate the allele frequency posterior distribution at AF given AC observations of N chromosomes
        switch ( ac ) {
            case 0:
                return logAC0Coef(n) + n*Math.log10(1 - af) - Math.log10(af);
            case 1:
                return Math.log10(n) + (n-1)*Math.log10(1-af) - n*Math.log10(1-EPS_LOWER_LIMIT);
            case 2:
                return Math.log10(n) + Math.log10(n-1) + Math.log10(af) + (n-2)*Math.log10(1-af) - Math.log10(1-(n-1)*EPS_LOWER_LIMIT) - (n-1)*Math.log10(EPS_LOWER_LIMIT);
            default:
                return (ac-1)*Math.log10(af)+ac*Math.log10( (double) n-ac)-(n-ac)*af*Math.log10(Math.E) - MathUtils.log10Gamma(ac);
        }
    }

    /**
     * Log10 normalization coefficient for the AC=0 posterior at allele number an, computed
     * once per an and cached (alternating-sign binomial series with an epsilon correction).
     */
    private double logAC0Coef(int an) {
        if ( ! logAC0Cache.containsKey(an) ) {
            double coef = -Math.log10(EPS_LOWER_LIMIT);
            for ( int k = 1; k <= an; k++ ) {
                // note this should typically just be
                // term = ( 1 - Math.pow(EPS_LOWER_LIMIT,k) ) * MathUtils.binomialCoefficient(an,k) / k
                // but the 1-E term will just be 1, so we do the following to mitigate this problem
                double binom = MathUtils.binomialCoefficient(an,k);
                double eps_correction = EPS_LOWER_LIMIT*Math.pow(binom,1/k);
                double term = binom/k - Math.pow(eps_correction,k);
                if ( k % 2 == 0 ) {
                    coef += term;
                } else {
                    coef -= term;
                }
            }
            logAC0Cache.put(an,coef);
        }
        return logAC0Cache.get(an);
    }

    /**
     * Adaptive Simpson quadrature of the AF posterior over [start, stop] with error
     * tolerance err and recursion cap.
     *
     * NOTE(review): calculateAFPosterior returns log10 densities, yet they are combined
     * with linear-space Simpson weights here -- confirm the intended space before use.
     */
    private double adaptiveSimpson(double[] likelihoods, double start, double stop, double err, int cap) {
        final double mid = (start + stop)/2;
        final double size = stop-start;
        final double fStart = calculateAFPosterior(likelihoods,start);
        final double fMid = calculateAFPosterior(likelihoods,mid);
        final double fStop = calculateAFPosterior(likelihoods,stop);
        // FIX: Simpson's rule weights the midpoint by 4; the previous code weighted the
        // right endpoint instead, and passed the midpoint/endpoint values to simpAux in
        // swapped slots relative to simpAux's (fa=f(a), fb=f(b), fc=f(mid)) convention.
        final double s = (size/6)*(fStart + 4*fMid + fStop);
        return simpAux(likelihoods,start,stop,err,s,fStart,fStop,fMid,cap);
    }

    /**
     * Recursive helper for adaptiveSimpson. Convention: fa=f(a), fb=f(b), fc=f((a+b)/2),
     * s = Simpson estimate over [a,b]. Subdivides until the Richardson error estimate
     * |S2-S| <= 15*eps or the recursion cap is exhausted.
     */
    private double simpAux(double[] likelihoods, double a,double b,double eps,double s,double fa,double fb,double fc,double cap){
        if ( s == 0 )
            return -300.0; // effectively log10(0) for an empty-mass interval
        double c = ( a + b )/2;
        double h = b-a;
        double d = (a + c)/2;
        double e = (c + b)/2;
        double fd = calculateAFPosterior(likelihoods, d);
        double fe = calculateAFPosterior(likelihoods, e);
        double s_l = (h/12)*(fa + 4*fd + fc);
        double s_r = (h/12)*(fc + 4*fe + fb);
        double s_2 = s_l + s_r;
        if ( cap <= 0 || Math.abs(s_2 - s) <= 15*eps ){
            // Richardson extrapolation term (S2-S)/15 sharpens the composite estimate.
            return Math.log10(s_2 + (s_2 - s)/15.0);
        }
        return MathUtils.approximateLog10SumLog10(simpAux(likelihoods,a,c,eps/2,s_l,fa,fc,fd,cap-1),simpAux(likelihoods, c, b, eps / 2, s_r, fc, fb, fe, cap - 1));
    }
}

View File

@ -118,6 +118,17 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false)
public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5;
/**
* Complementary argument to minIndelCnt. Only samples with at least this fraction of indel-containing reads will contribute
* to counting and overcoming the threshold minIndelCnt. This parameter ensures that in deep data you don't end
* up summing lots of super rare errors up to overcome the 5 read default threshold. Should work equally well for
* low-coverage and high-coverage samples, as low coverage samples with any indel containing reads should easily over
* come this threshold.
*/
@Argument(fullName = "min_indel_fraction_per_sample", shortName = "minIndelFrac", doc = "Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles", required = false)
public double MIN_INDEL_FRACTION_PER_SAMPLE = 0.25;
/**
* This argument informs the prior probability of having an indel at a site.
*/
@ -165,6 +176,7 @@ public class UnifiedArgumentCollection {
uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION;
uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING;
uac.MIN_INDEL_FRACTION_PER_SAMPLE = MIN_INDEL_FRACTION_PER_SAMPLE;
uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY;
uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY;
uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY;

View File

@ -137,6 +137,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null;
@Hidden
@Argument(fullName = "debug_file", shortName = "debug_file", doc = "File to print all of the annotated and detailed debugging output", required = false)
protected PrintStream verboseWriter = null;
@ -218,7 +219,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
// initialize the verbose writer
if ( verboseWriter != null )
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tAFposterior\tNormalizedPosterior");
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tMLE\tMAP");
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples, 2*samples.size());

View File

@ -84,8 +84,8 @@ public class UnifiedGenotyperEngine {
private ThreadLocal<double[]> posteriorsArray = new ThreadLocal<double[]>();
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
private final double[][] log10AlleleFrequencyPriorsSNPs;
private final double[][] log10AlleleFrequencyPriorsIndels;
private final double[] log10AlleleFrequencyPriorsSNPs;
private final double[] log10AlleleFrequencyPriorsIndels;
// the priors object
private final GenotypePriors genotypePriorsSNPs;
@ -131,9 +131,8 @@ public class UnifiedGenotyperEngine {
this.annotationEngine = engine;
this.N = N;
log10AlleleFrequencyPriorsSNPs = new double[UAC.MAX_ALTERNATE_ALLELES][N+1];
log10AlleleFrequencyPriorsIndels = new double[UAC.MAX_ALTERNATE_ALLELES][N+1];
log10AlleleFrequencyPriorsSNPs = new double[N+1];
log10AlleleFrequencyPriorsIndels = new double[N+1];
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsSNPs, UAC.heterozygosity);
computeAlleleFrequencyPriors(N, log10AlleleFrequencyPriorsIndels, UAC.INDEL_HETEROZYGOSITY);
genotypePriorsSNPs = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.SNP);
@ -269,8 +268,8 @@ public class UnifiedGenotyperEngine {
// initialize the data for this thread if that hasn't been done yet
if ( afcm.get() == null ) {
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES, N));
posteriorsArray.set(new double[N + 2]);
alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES));
posteriorsArray.set(new double[2]);
}
AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get();
@ -283,9 +282,7 @@ public class UnifiedGenotyperEngine {
generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext));
}
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
AFresult.reset();
List<Allele> allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult);
// is the most likely frequency conformation AC=0 for all alternate alleles?
@ -300,12 +297,11 @@ public class UnifiedGenotyperEngine {
// the genotyping model may have stripped it out
if ( indexOfAllele == -1 )
continue;
int indexOfBestAC = MathUtils.maxElementIndex(AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1]);
// if the most likely AC is not 0, then this is a good alternate allele to use;
// make sure to test against log10PosteriorOfAFzero since that no longer is an entry in the array
if ( indexOfBestAC != 0 && AFresult.log10AlleleFrequencyPosteriors[indexOfAllele-1][indexOfBestAC] > AFresult.log10PosteriorOfAFzero ) {
int indexOfBestAC = AFresult.getAlleleCountsOfMAP()[indexOfAllele-1];
// if the most likely AC is not 0, then this is a good alternate allele to use
if ( indexOfBestAC != 0 ) {
myAlleles.add(alternateAllele);
bestGuessIsRef = false;
}
@ -316,7 +312,6 @@ public class UnifiedGenotyperEngine {
}
// calculate p(f>0):
// because the likelihoods are marginalized for each alternate allele, we only need to compare log10PosteriorOfAFzero against any one of them
final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, posteriorsArray.get());
final double PofF = 1.0 - normalizedPosteriors[0];
@ -324,18 +319,11 @@ public class UnifiedGenotyperEngine {
if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
if ( Double.isInfinite(phredScaledConfidence) )
phredScaledConfidence = -10.0 * AFresult.log10PosteriorOfAFzero;
phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero();
} else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
if ( Double.isInfinite(phredScaledConfidence) ) {
double sum = AFresult.log10AlleleFrequencyPosteriors[0][0];
if ( sum == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
sum = 0.0;
for (int i = 1; i <= N; i++) {
if ( AFresult.log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
break;
sum += AFresult.log10AlleleFrequencyPosteriors[0][i];
}
final double sum = AFresult.getLog10PosteriorMatrixSum();
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
}
}
@ -364,7 +352,7 @@ public class UnifiedGenotyperEngine {
// print out stats if we have a writer
if ( verboseWriter != null && !limitedContext )
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, normalizedPosteriors, model);
printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model);
// *** note that calculating strand bias involves overwriting data structures, so we do that last
final HashMap<String, Object> attributes = new HashMap<String, Object>();
@ -378,29 +366,27 @@ public class UnifiedGenotyperEngine {
// the overall lod
//double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
double overallLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0);
double overallLog10PofF = AFresult.getLog10PosteriorMatrixSum();
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
List<Allele> alternateAllelesToUse = builder.make().getAlternateAlleles();
// the forward lod
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, alternateAllelesToUse, false, model);
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
AFresult.reset();
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double forwardLog10PofNull = AFresult.log10PosteriorOfAFzero;
double forwardLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0);
double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
double forwardLog10PofF = AFresult.getLog10PosteriorMatrixSum();
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, alternateAllelesToUse, false, model);
clearAFarray(AFresult.log10AlleleFrequencyLikelihoods);
clearAFarray(AFresult.log10AlleleFrequencyPosteriors);
AFresult.reset();
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double reverseLog10PofNull = AFresult.log10PosteriorOfAFzero;
double reverseLog10PofF = MathUtils.log10sumLog10(AFresult.log10AlleleFrequencyPosteriors[0], 0);
double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
double reverseLog10PofF = AFresult.getLog10PosteriorMatrixSum();
//if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
@ -437,9 +423,9 @@ public class UnifiedGenotyperEngine {
return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF));
}
private double[] generateNormalizedPosteriors(AlleleFrequencyCalculationResult AFresult, double[] normalizedPosteriors) {
normalizedPosteriors[0] = AFresult.log10PosteriorOfAFzero;
System.arraycopy(AFresult.log10AlleleFrequencyPosteriors[0], 0, normalizedPosteriors, 1, normalizedPosteriors.length-1);
public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) {
normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero();
normalizedPosteriors[1] = AFresult.getLog10PosteriorMatrixSum();
return MathUtils.normalizeFromLog10(normalizedPosteriors);
}
@ -499,14 +485,6 @@ public class UnifiedGenotyperEngine {
return stratifiedContexts;
}
protected static void clearAFarray(double[][] AFs) {
for ( int i = 0; i < AFs.length; i++ ) {
for ( int j = 0; j < AFs[i].length; j++ ) {
AFs[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
}
}
}
private final static double[] binomialProbabilityDepthCache = new double[10000];
static {
for ( int i = 1; i < binomialProbabilityDepthCache.length; i++ ) {
@ -551,7 +529,7 @@ public class UnifiedGenotyperEngine {
return new VariantCallContext(vc, QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false);
}
protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, double[] normalizedPosteriors, final GenotypeLikelihoodsCalculationModel.Model model) {
protected void printVerboseData(String pos, VariantContext vc, double PofF, double phredScaledConfidence, final GenotypeLikelihoodsCalculationModel.Model model) {
Allele refAllele = null, altAllele = null;
for ( Allele allele : vc.getAlleles() ) {
if ( allele.isReference() )
@ -574,11 +552,8 @@ public class UnifiedGenotyperEngine {
AFline.append(i + "/" + N + "\t");
AFline.append(String.format("%.2f\t", ((float)i)/N));
AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i]));
if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED)
AFline.append("0.00000000\t");
else
AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i]));
AFline.append(String.format("%.8f\t", normalizedPosteriors[i]));
AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MLE()));
AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MAP()));
verboseWriter.println(AFline.toString());
}
@ -648,31 +623,29 @@ public class UnifiedGenotyperEngine {
return null;
}
protected static void computeAlleleFrequencyPriors(final int N, final double[][] priors, final double theta) {
protected static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) {
// the dimension here is the number of alternate alleles; with e.g. 2 alternate alleles the prior will be theta^2 / i
for (int alleles = 1; alleles <= priors.length; alleles++) {
double sum = 0.0;
double sum = 0.0;
// for each i
for (int i = 1; i <= N; i++) {
double value = Math.pow(theta, alleles) / (double)i;
priors[alleles-1][i] = Math.log10(value);
sum += value;
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[alleles-1][0] = Math.log10(1.0 - sum);
// for each i
for (int i = 1; i <= N; i++) {
final double value = theta / (double)i;
priors[i] = Math.log10(value);
sum += value;
}
// null frequency for AF=0 is (1 - sum(all other frequencies))
priors[0] = Math.log10(1.0 - sum);
}
protected double[][] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) {
protected double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Model model ) {
if (model.name().toUpperCase().contains("SNP"))
return log10AlleleFrequencyPriorsSNPs;
else if (model.name().toUpperCase().contains("INDEL"))
return log10AlleleFrequencyPriorsIndels;
else
throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model);
}
private static GenotypePriors createGenotypePriors( final GenotypeLikelihoodsCalculationModel.Model model ) {

View File

@ -25,19 +25,13 @@ package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult;
import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
public class GLBasedSampleSelector extends SampleSelector {
Map<Integer,double[][]> numAllelePriorMatrix = new HashMap<Integer,double[][]>();
double[] flatPriors = null;
double referenceLikelihood;
public GLBasedSampleSelector(TreeSet<String> sm, double refLik) {
super(sm);
@ -53,9 +47,11 @@ public class GLBasedSampleSelector extends SampleSelector {
// now check to see (using EXACT model) whether this should be variant
// do we want to apply a prior? maybe user-spec?
double[][] flatPrior = createFlatPrior(vc.getAlleles());
AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size(),2*samples.size());
ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPrior,result,true);
if ( flatPriors == null ) {
flatPriors = new double[1+2*samples.size()];
}
AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size());
ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result);
// do we want to let this qual go up or down?
if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) {
return true;
@ -63,12 +59,4 @@ public class GLBasedSampleSelector extends SampleSelector {
return false;
}
private double[][] createFlatPrior(List<Allele> alleles) {
if ( ! numAllelePriorMatrix.containsKey(alleles.size()) ) {
numAllelePriorMatrix.put(alleles.size(), new double[alleles.size()][1+2*samples.size()]);
}
return numAllelePriorMatrix.get(alleles.size());
}
}

View File

@ -371,18 +371,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
// find the comp
final VariantContext comp = findMatchingComp(eval, compSet);
HashMap<VariantStratifier, List<String>> stateMap = new HashMap<VariantStratifier, List<String>>();
for ( VariantStratifier vs : stratificationObjects ) {
List<String> states = vs.getRelevantStates(ref, tracker, comp, compRod.getName(), eval, evalRod.getName(), sampleName);
stateMap.put(vs, states);
}
ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);
HashSet<StateKey> stateKeysHash = new HashSet<StateKey>(stateKeys);
for ( StateKey stateKey : stateKeysHash ) {
for ( StateKey stateKey : getApplicableStates(tracker, ref, eval, evalRod.getName(), comp, compRod.getName(), sampleName) ) {
NewEvaluationContext nec = evaluationContexts.get(stateKey);
// eval against the comp
@ -410,6 +399,73 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
return null;
}
// private Iterable<StateKey> getApplicableStates(final RefMetaDataTracker tracker,
// final ReferenceContext ref,
// final VariantContext eval,
// final String evalName,
// final VariantContext comp,
// final String compName,
// final String sampleName ) {
// Set<StateKey> oldKeys = new HashSet<StateKey>(Utils.makeCollection(getApplicableStatesOld(tracker, ref, eval, evalName, comp, compName, sampleName)));
//
// int n = 0;
// for ( final StateKey newKey : getApplicableStatesNew(tracker, ref, eval, evalName, comp, compName, sampleName) ) {
// n++;
// if ( ! oldKeys.contains(newKey) )
// throw new ReviewedStingException("New key " + newKey + " missing from previous algorithm");
// }
//
// if ( n != oldKeys.size() )
// throw new ReviewedStingException("New keyset has " + n + " elements but previous algorithm had " + oldKeys.size());
//
// return oldKeys;
// }
// private Iterable<StateKey> getApplicableStatesNew(final RefMetaDataTracker tracker,
// final ReferenceContext ref,
// final VariantContext eval,
// final String evalName,
// final VariantContext comp,
// final String compName,
// final String sampleName ) {
// // todo -- implement optimized version
// }
/**
* Given specific eval and comp VCs and the sample name, return an iterable
* over all of the applicable state keys.
*
* See header of StateKey for performance problems...
*
* @param tracker
* @param ref
* @param eval
* @param evalName
* @param comp
* @param compName
* @param sampleName
* @return
*/
private Iterable<StateKey> getApplicableStates(final RefMetaDataTracker tracker,
final ReferenceContext ref,
final VariantContext eval,
final String evalName,
final VariantContext comp,
final String compName,
final String sampleName ) {
// Ask each active stratifier which of its states this eval/comp pair belongs to.
final HashMap<VariantStratifier, List<String>> stateMap = new HashMap<VariantStratifier, List<String>>(stratificationObjects.size());
for ( final VariantStratifier vs : stratificationObjects ) {
List<String> states = vs.getRelevantStates(ref, tracker, comp, compName, eval, evalName, sampleName);
stateMap.put(vs, states);
}
// Expand the per-stratifier state lists into the cross-product of concrete StateKeys.
ArrayList<StateKey> stateKeys = new ArrayList<StateKey>();
variantEvalUtils.initializeStateKeys(stateMap, null, null, stateKeys);
// De-duplicate: the expansion can emit the same key more than once.
return new HashSet<StateKey>(stateKeys);
}
@Requires({"comp != null", "evals != null"})
private boolean compHasMatchingEval(final VariantContext comp, final Collection<VariantContext> evals) {
// find all of the matching comps
@ -482,11 +538,13 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
public void onTraversalDone(Integer result) {
logger.info("Finalizing variant report");
for ( StateKey stateKey : evaluationContexts.keySet() ) {
NewEvaluationContext nec = evaluationContexts.get(stateKey);
for ( Map.Entry<StateKey, NewEvaluationContext> ecElt : evaluationContexts.entrySet() ) {
final StateKey stateKey = ecElt.getKey();
final NewEvaluationContext nec = ecElt.getValue();
for ( VariantEvaluator ve : nec.getEvaluationClassList().values() ) {
ve.finalizeEvaluation();
final String veName = ve.getSimpleName(); // ve.getClass().getSimpleName();
AnalysisModuleScanner scanner = new AnalysisModuleScanner(ve);
Map<Field, DataPoint> datamap = scanner.getData();
@ -498,7 +556,7 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
if (field.get(ve) instanceof TableType) {
TableType t = (TableType) field.get(ve);
final String subTableName = ve.getClass().getSimpleName() + "." + field.getName();
final String subTableName = veName + "." + field.getName();
final DataPoint dataPointAnn = datamap.get(field);
GATKReportTable table;
@ -539,11 +597,10 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
}
}
} else {
GATKReportTable table = report.getTable(ve.getClass().getSimpleName());
GATKReportTable table = report.getTable(veName);
for ( VariantStratifier vs : stratificationObjects ) {
String columnName = vs.getName();
final String columnName = vs.getName();
table.set(stateKey.toString(), columnName, stateKey.get(vs.getName()));
}

View File

@ -8,6 +8,11 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
public abstract class VariantEvaluator {
private VariantEvalWalker walker;
private final String simpleName;
protected VariantEvaluator() {
this.simpleName = getClass().getSimpleName();
}
public void initialize(VariantEvalWalker walker) {
this.walker = walker;
@ -90,4 +95,8 @@ public abstract class VariantEvaluator {
protected static String formattedRatio(final int num, final int denom) {
return denom == 0 ? "NA" : String.format("%.2f", num / (1.0 * denom));
}
public String getSimpleName() {
return simpleName;
}
}

View File

@ -27,6 +27,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.lang.annotation.Annotation;
import java.lang.reflect.Field;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
@ -40,6 +41,7 @@ import java.util.Map;
* the object, a Mashalling object can serialize or deserialize a analysis module.
*/
public class AnalysisModuleScanner {
final private static Map<String, Annotation[]> annotationCache = new HashMap<String, Annotation[]>();
// what we extracted from the class
private Map<Field, DataPoint> datums = new LinkedHashMap<Field, DataPoint>(); // the data we've discovered
@ -84,12 +86,22 @@ public class AnalysisModuleScanner {
// get the fields from the class, and extract
for ( Class superCls = cls; superCls != null; superCls=superCls.getSuperclass() ) {
for (Field f : superCls.getDeclaredFields())
for (Annotation annotation : f.getAnnotations()) {
for (Annotation annotation : getAnnotations(f)) {
if (annotation.annotationType().equals(DataPoint.class))
datums.put(f,(DataPoint) annotation);
}
}
}
/**
 * Returns the annotations of the given field, memoized in the static annotationCache
 * keyed by Field.toString() -- presumably because module scanning revisits the same
 * fields repeatedly and reflective annotation lookup is comparatively expensive.
 */
private Annotation[] getAnnotations(final Field field) {
// Field.toString() encodes declaring class + type + name, so it is a stable cache key.
final String fieldName = field.toString();
Annotation[] annotations = annotationCache.get(fieldName);
if ( annotations == null ) {
annotations = field.getAnnotations();
annotationCache.put(fieldName, annotations);
}
return annotations;
}
/**
*

View File

@ -36,6 +36,16 @@ public class NewEvaluationContext extends HashMap<VariantStratifier, String> {
return new TreeMap<String, VariantEvaluator>(evaluationInstances);
}
/**
 * Builds a StateKey for this context by mapping each stratifier's name to its
 * current state string (this class maps VariantStratifier -> state value).
 */
public StateKey makeStateKey() {
Map<String, String> map = new HashMap<String, String>(size());
for (Map.Entry<VariantStratifier, String> elt : this.entrySet() ) {
// Key by the stratifier's name rather than the object itself.
map.put(elt.getKey().getName(), elt.getValue());
}
return new StateKey(map);
}
public void apply(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context, VariantContext comp, VariantContext eval) {
for ( final VariantEvaluator evaluation : evaluationInstances.values() ) {
// the other updateN methods don't see a null context

View File

@ -3,25 +3,102 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.util;
import java.util.Map;
import java.util.TreeMap;
public class StateKey extends TreeMap<String, String> {
// public int hashCode() {
// int hashCode = 1;
//
// for (final Map.Entry<String,String> pair : this.entrySet()) {
// hashCode *= pair.getKey().hashCode() + pair.getValue().hashCode();
// }
//
// return hashCode;
// }
/**
* A final constant class representing the specific state configuration
* for a VariantEvaluator instance.
*
* The way this is currently implemented is by a map from the name of a VariantStratification to a
* specific state string. For example, the stratification Novelty has states all, known, novel. A
* specific variant and comp would be tagged as "known" by the stratification, and this could be
* represented here by the map (Novelty -> known).
*
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
* TODO -- PERFORMANCE PROBLEM -- MAD 03/27/12
*
* I've been staring at this state key code for a while. It's just not right, and expensive to boot.
* Here are my thoughts for future work. The state key is both a key with specific state values for
* every stratification. For example, (known, sample1, ac=1). This capability is used in some places,
* such as below, to return a set of all states that should be updated given the eval and comp
* VCs. In principle there are a finite set of such combinations (the product of all states for all active
* stratifications at initialization). We could represent such keys as integers into the set of all combinations.
*
* Note that all of the code that manipulates these things is just terrible. It's all string manipulation and
* HashMaps. Since we are effectively always squaring off our VE analyses (i.e., we have a table with
* all variable values for all stratification combinations) it doesn't make sense to allow so much dynamism. Instead
* we should just upfront create a giant table indexed by integer keys, and manage data via a simple map from
* specific strat state to this key.
*
* The reason this is so important is that >80% of the runtime of VE with VCFs with >1000 samples is spent in
* the initializeStateKey function. Instead, we should have code that looks like:
*
* init:
* allStates <- initializeCombinationalStateSpace
*
* map:
* for each eval / comp pair:
* for each relevantState based on eval / comp:
* allStates[relevantState].update(eval, comp)
*
*
*/
public final class StateKey {
/** High-performance cache of the toString operation for a constant class */
private final String string;
private final TreeMap<String, String> states;
public String toString() {
String value = "";
public StateKey(final Map<String, String> states) {
this.states = new TreeMap<String, String>(states);
this.string = formatString();
}
for ( final String key : this.keySet() ) {
//value += "\tstate " + key + ":" + this.get(key) + "\n";
value += String.format("%s:%s;", key, this.get(key));
public StateKey(final StateKey toOverride, final String keyOverride, final String valueOverride) {
if ( toOverride == null ) {
this.states = new TreeMap<String, String>();
} else {
this.states = new TreeMap<String, String>(toOverride.states);
}
return value;
this.states.put(keyOverride, valueOverride);
this.string = formatString();
}
/**
 * Two StateKeys are equal iff their underlying state maps are equal;
 * the cached string representation is derived from the map and ignored.
 */
@Override
public boolean equals(final Object o) {
    if ( this == o )
        return true;
    if ( o == null || getClass() != o.getClass() )
        return false;

    final StateKey that = (StateKey) o;
    return states == null ? that.states == null : states.equals(that.states);
}
@Override
public int hashCode() {
    // Delegate to the underlying sorted state map; consistent with equals(),
    // which also compares only the states map.
    return states.hashCode();
}
@Override
public String toString() {
    // Return the representation precomputed by formatString() in the
    // constructors; StateKey is immutable, so the cached string never goes
    // stale.  This is the "high-performance cache" noted on the field.
    return string;
}
/**
 * Formats the state map as "key1:value1;key2:value2;...", in the TreeMap's
 * sorted key order.  Called once per constructor to populate the cached
 * {@code string} field.
 *
 * Uses direct StringBuilder appends instead of per-entry String.format: the
 * output is identical, but it avoids format-string parsing on every entry in
 * a class whose javadoc already flags it as a performance hot spot.
 *
 * @return the canonical string form of this StateKey
 */
private final String formatString() {
    final StringBuilder repr = new StringBuilder();
    for ( final Map.Entry<String, String> state : states.entrySet() )
        repr.append(state.getKey()).append(':').append(state.getValue()).append(';');
    return repr.toString();
}
// TODO -- might be slow because of tree map
public String get(final String key) {
    // Look up the state value recorded for a stratification name,
    // e.g. "Novelty" -> "known" (see class javadoc for the state model).
    return states.get(key);
}
}

View File

@ -214,20 +214,9 @@ public class VariantEvalUtils {
ecs.putAll(initializeEvaluationContexts(stratificationObjects, evaluationObjects, newStratStack, nec));
}
} else {
HashMap<StateKey, NewEvaluationContext> necs = new HashMap<StateKey, NewEvaluationContext>();
StateKey stateKey = new StateKey();
for (VariantStratifier vs : ec.keySet()) {
String state = ec.get(vs);
stateKey.put(vs.getName(), state);
}
final StateKey stateKey = ec.makeStateKey();
ec.addEvaluationClassList(variantEvalWalker, stateKey, evaluationObjects);
necs.put(stateKey, ec);
return necs;
return new HashMap<StateKey, NewEvaluationContext>(Collections.singletonMap(stateKey, ec));
}
return ecs;
@ -428,14 +417,8 @@ public class VariantEvalUtils {
HashMap<VariantStratifier, List<String>> oneSetOfStates = newStateStack.pop();
VariantStratifier vs = oneSetOfStates.keySet().iterator().next();
for (String state : oneSetOfStates.get(vs)) {
StateKey newStateKey = new StateKey();
if (stateKey != null) {
newStateKey.putAll(stateKey);
}
newStateKey.put(vs.getName(), state);
for (final String state : oneSetOfStates.get(vs)) {
final StateKey newStateKey = new StateKey(stateKey, vs.getName(), state);
initializeStateKeys(stateMap, newStateStack, newStateKey, stateKeys);
}
} else {

View File

@ -182,7 +182,6 @@ public class Haplotype {
public static LinkedHashMap<Allele,Haplotype> makeHaplotypeListFromAlleles(List<Allele> alleleList, int startPos, ReferenceContext ref,
final int haplotypeSize, final int numPrefBases) {
LinkedHashMap<Allele,Haplotype> haplotypeMap = new LinkedHashMap<Allele,Haplotype>();
Allele refAllele = null;
@ -215,13 +214,13 @@ public class Haplotype {
// Create location for all haplotypes
int startLoc = ref.getWindow().getStart() + startIdxInReference;
int stopLoc = startLoc + haplotypeSize-1;
final int startLoc = ref.getWindow().getStart() + startIdxInReference;
final int stopLoc = startLoc + haplotypeSize-1;
GenomeLoc locus = ref.getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),startLoc,stopLoc);
final GenomeLoc locus = ref.getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),startLoc,stopLoc);
for (Allele a : alleleList) {
for (final Allele a : alleleList) {
byte[] alleleBases = a.getBases();
// use string concatenation
@ -315,5 +314,4 @@ public class Haplotype {
return (fallsInsideDeletion ? -1 : readBases);
}
}

View File

@ -234,6 +234,9 @@ public class MathUtils {
double sum = 0.0;
double maxValue = Utils.findMaxEntry(log10p);
if(maxValue == Double.NEGATIVE_INFINITY)
return sum;
for (int i = start; i < finish; i++) {
sum += Math.pow(10.0, log10p[i] - maxValue);
}
@ -1508,6 +1511,24 @@ public class MathUtils {
return result;
}
/**
 * Element-wise sum of two vectors, using unboxed primitives for efficiency
 * (primitive counterpart of the generic boxed vectorSum routine).
 *
 * @param x first addend vector
 * @param y second addend vector; must have the same length as x
 * @return a new vector z of the same length as x and y with z[k] = x[k] + y[k]
 * @throws ReviewedStingException if x and y differ in length
 */
public static double[] vectorSum(final double[] x, final double[] y) {
    if (x.length != y.length)
        throw new ReviewedStingException("BUG: Lengths of x and y must be the same");

    final double[] result = new double[x.length];
    for (int k = 0; k < x.length; k++)
        result[k] = x[k] + y[k];

    return result;
}
public static <E extends Number> Double[] scalarTimesVector(E a, E[] v1) {
Double result[] = new Double[v1.length];

View File

@ -703,4 +703,11 @@ public class Utils {
return programRecord;
}
/**
 * Materializes an Iterable into a Collection by copying every element, in
 * iteration order, into a new ArrayList.
 *
 * @param iter the elements to collect; iterated exactly once
 * @param <E>  element type
 * @return a new mutable Collection containing iter's elements
 */
public static <E> Collection<E> makeCollection(Iterable<E> iter) {
    final Collection<E> collected = new ArrayList<E>();
    for ( final E element : iter )
        collected.add(element);
    return collected;
}
}

View File

@ -1,7 +1,6 @@
package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
@ -18,7 +17,7 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
static double[] AA1, AB1, BB1;
static double[] AA2, AB2, AC2, BB2, BC2, CC2;
static final int numSamples = 3;
static double[][] priors = new double[2][2*numSamples+1]; // flat priors
static double[] priors = new double[2*numSamples+1]; // flat priors
@BeforeSuite
public void before() {
@ -83,26 +82,16 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
@Test(dataProvider = "getGLs")
public void testGLs(GetGLsTest cfg) {
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2, 2*numSamples);
for ( int i = 0; i < 2; i++ ) {
for ( int j = 0; j < 2*numSamples+1; j++ ) {
result.log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
result.log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
}
}
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2);
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result, false);
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result);
int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
int calculatedAlleleCount = MathUtils.maxElementIndex(result.log10AlleleFrequencyPosteriors[allele]);
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
if ( result.log10AlleleFrequencyPosteriors[0][0] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) {
Assert.assertTrue(calculatedAlleleCount == expectedAlleleCount || result.log10AlleleFrequencyPosteriors[0][calculatedAlleleCount] < result.log10PosteriorOfAFzero);
} else {
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
}
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
}
}
}

View File

@ -17,8 +17,8 @@ import java.util.Map;
public class UnifiedGenotyperIntegrationTest extends WalkerTest {
private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129;
private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b36dbSNP129;
private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132;
// --------------------------------------------------------------------------------------------------------------
@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("8f81a14fffc1a59b4b066f8595dc1232"));
Arrays.asList("ac3737b4212f634a03c640c83f670955"));
executeTest("test MultiSample Pilot1", spec);
}
@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1,
Arrays.asList("0de4aeed6a52f08ed86a7642c812478b"));
Arrays.asList("6f70dfbaf3bb70c702f9e9dbacd67c17"));
executeTest("test Multiple SNP alleles", spec);
}
@ -138,7 +138,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testSLOD() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("6172d2f3d370132f4c57a26aa94c256e"));
Arrays.asList("e9d23a08472e4e27b4f25e844f5bad57"));
executeTest("test SLOD", spec);
}
@ -146,8 +146,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testOutputParameter() {
HashMap<String, String> e = new HashMap<String, String>();
e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" );
e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "553f6b4cbf380885bec9dd634cf68742" );
e.put( "--output_mode EMIT_ALL_SITES", "6d8624e45ad9dae5803ac705b39e4ffa" );
e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "ecf92054c1e4bd9d6529b8002d385165" );
e.put( "--output_mode EMIT_ALL_SITES", "119c9fcefbc69e0fc10b1dc52f6438e3" );
for ( Map.Entry<String, String> entry : e.entrySet() ) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
@ -300,13 +300,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMultiSampleIndels() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("52340d578a708fa709b69ce48987bc9d"));
Arrays.asList("fbc48d7d9e622c9af7922f91bc858151"));
List<File> result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst();
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1,
Arrays.asList("9566c7abef5ee5829a516d90445b347f"));
Arrays.asList("94c52ef70e44709ccd947d32e9c27da9"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@ -335,4 +335,37 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
UserException.class);
executeTest("testSnpEffAnnotationRequestedWithoutRodBinding", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing minIndelFraction
//
// --------------------------------------------------------------------------------------------------------------
// Shared command line for the -minIndelFrac tests below: b37 indel calling over a
// small interval of 978604.bam, restricted (via -rf Sample / -goodSM) to five samples.
final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation
        + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030";

// -minIndelFrac 0.0 -- presumably no minimum-indel-fraction filtering; the MD5
// pins the expected call set for regression detection.
@Test
public void testMinIndelFraction0() {
    WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
            assessMinIndelFraction + " -minIndelFrac 0.0", 1,
            Arrays.asList("f08ff07ad49d388198c1887baad05977"));
    executeTest("test minIndelFraction 0.0", spec);
}

// Same command at threshold 0.25; expected output differs from the 0.0 case.
@Test
public void testMinIndelFraction25() {
    WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
            assessMinIndelFraction + " -minIndelFrac 0.25", 1,
            Arrays.asList("a0945fd21369aaf68c7f1d96dbb930d1"));
    executeTest("test minIndelFraction 0.25", spec);
}

// Same command at the maximal threshold of 1.
@Test
public void testMinIndelFraction100() {
    WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
            assessMinIndelFraction + " -minIndelFrac 1", 1,
            Arrays.asList("50fe9a4c5633f6395b45d9ec1e00d56a"));
    executeTest("test minIndelFraction 1.0", spec);
}
}

View File

@ -0,0 +1,90 @@
package org.broadinstitute.sting.gatk.walkers.validation;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
/**
 * Integration tests for the ValidationSiteSelector walker, covering the
 * available sample-selection and frequency-selection modes.
 *
 * @author delangel
 * @since 3/26/12
 */
public class ValidationSiteSelectorIntegrationTest extends WalkerTest {
    // Common command line: select 100 validation sites on chromosome 1 of b36,
    // with the caller-supplied selection-mode arguments appended.
    public static String baseTestString(String args) {
        return "-T ValidationSiteSelector -R " + b36KGReference + " -L 1 -o %s -NO_HEADER -numSites 100 " + args;
    }

    // Input VCF and the sample subset file shared by all tests below.
    private static String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf";
    private static String samplesFile = validationDataLocation + "SelectVariants.samples.txt";
    private static String samplePrefix = " -sf " + samplesFile;

    // Frequency-selection argument fragments: uniform sampling vs. preserving
    // the input allele-frequency spectrum.
    private static String freqUnif = " --frequencySelectionMode UNIFORM ";
    private static String freqAF = " --frequencySelectionMode KEEP_AF_SPECTRUM ";

    // Sample-selection argument fragments: no sample-based selection, or
    // polymorphism determined from genotypes (GT) or genotype likelihoods (GL).
    private static String sampleNone = " -sampleMode NONE ";
    private static String sampleGT = samplePrefix+" -sampleMode POLY_BASED_ON_GT ";
    private static String sampleGL = samplePrefix+" -sampleMode POLY_BASED_ON_GL -samplePNonref 0.95";

    // Each test pins its expected output with an MD5 checksum for regression detection.

    @Test(enabled=true)
    public void testNoSampleSelectionFreqUniform() {
        WalkerTestSpec spec = new WalkerTestSpec(
                baseTestString(sampleNone + freqUnif + "--variant " + testfile),
                1,
                Arrays.asList("d49baeb8000a426c172ce1d81eb37963")
        );
        executeTest("testNoSampleSelectionFreqUniform--" + testfile, spec);
    }

    @Test(enabled=true)
    public void testNoSampleSelectionFreqAF() {
        WalkerTestSpec spec = new WalkerTestSpec(
                baseTestString(sampleNone + freqAF + "--variant " + testfile),
                1,
                Arrays.asList("0fb0d015d462c34514fc7e96beea5f56")
        );
        executeTest("testNoSampleSelectionFreqAF--" + testfile, spec);
    }

    @Test(enabled=true)
    public void testPolyGTFreqUniform() {
        WalkerTestSpec spec = new WalkerTestSpec(
                baseTestString(sampleGT + freqUnif + "--variant " + testfile),
                1,
                Arrays.asList("0672854299d42ea8af906976a3849ae6")
        );
        executeTest("testPolyGTFreqUniform--" + testfile, spec);
    }

    @Test(enabled=true)
    public void testPolyGTFreqAF() {
        WalkerTestSpec spec = new WalkerTestSpec(
                baseTestString(sampleGT + freqAF + "--variant " + testfile),
                1,
                Arrays.asList("5bdffda1a063d0bddd6b236854ec627d")
        );
        executeTest("testPolyGTFreqAF--" + testfile, spec);
    }

    @Test(enabled=true)
    public void testPolyGLFreqAF() {
        WalkerTestSpec spec = new WalkerTestSpec(
                baseTestString(sampleGL + freqAF + "--variant " + testfile),
                1,
                Arrays.asList("35ef16aa41303606a4b94f7b88bd9aa8")
        );
        executeTest("testPolyGLFreqAF--" + testfile, spec);
    }
}