Merge pull request #628 from broadinstitute/vrr_per_read_allele_likelihoods

ReadLikelihoods container using arrays rather than Maps of Maps of Maps
Valentin Ruano Rubio, 2014-08-11 20:32:37 -04:00
commit 1558f9c49a
32 changed files with 3059 additions and 841 deletions
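
The change this PR makes is easiest to see side by side. Below is a toy sketch (hypothetical stand-in classes, not the actual GATK ReadLikelihoods implementation) of the two layouts: the old sample -> read -> allele nesting that the title's "Maps of Maps of Maps" refers to, and the dense per-sample matrix the new container is built around.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical stand-ins illustrating the data-layout change, not GATK code.
public final class LayoutSketch {

    // Old shape: sample -> read -> allele -> log10 likelihood.
    // Every lookup hashes three times and boxes/unboxes every double.
    static final Map<String, Map<String, Map<String, Double>>> NESTED = new HashMap<>();

    // New shape: one dense primitive matrix per sample, indexed as [allele][read],
    // with object-to-offset tables built once up front.
    static final class SampleMatrix {
        final double[][] values; // values[alleleIndex][readIndex]
        final Map<String, Integer> alleleIndex = new HashMap<>();
        final Map<String, Integer> readIndex = new HashMap<>();

        SampleMatrix(final List<String> alleles, final List<String> reads) {
            values = new double[alleles.size()][reads.size()];
            for (int a = 0; a < alleles.size(); a++) alleleIndex.put(alleles.get(a), a);
            for (int r = 0; r < reads.size(); r++) readIndex.put(reads.get(r), r);
        }

        double get(final String allele, final String read) {
            return values[alleleIndex.get(allele)][readIndex.get(read)];
        }
    }

    public static void main(final String[] args) {
        final SampleMatrix m = new SampleMatrix(Arrays.asList("hapA", "hapB"), Arrays.asList("read1", "read2"));
        m.values[0][1] = -3.5;
        System.out.println(m.get("hapA", "read2")); // prints -3.5
    }
}

A dense primitive matrix avoids the triple hashing and Double boxing of the nested shape, and turns whole-collection operations such as marginalization and normalization into plain index loops.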

StandardCallerArgumentCollection.java

@@ -99,6 +99,11 @@ public class StandardCallerArgumentCollection implements Cloneable {
     @Argument(fullName = "contamination_fraction_per_sample_file", shortName = "contaminationFile", doc = "Tab-separated File containing fraction of contamination in sequencing data (per sample) to aggressively remove. Format should be \"<SampleID><TAB><Contamination>\" (Contamination is double) per line; No header.", required = false)
     public File CONTAMINATION_FRACTION_FILE = null;
 
+    /**
+     * Indicates whether there is some sample contamination present.
+     */
+    private boolean sampleContaminationWasLoaded = false;
+
     /**
      *
      * @return an _Immutable_ copy of the Sample-Contamination Map, defaulting to CONTAMINATION_FRACTION so that if the sample isn't in the map map(sample)==CONTAMINATION_FRACTION
@@ -106,15 +111,32 @@ public class StandardCallerArgumentCollection implements Cloneable {
     public Map<String,Double> getSampleContamination(){
         //make sure that the default value is set up right
         sampleContamination.setDefaultValue(CONTAMINATION_FRACTION);
+        if (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0)
+            sampleContaminationWasLoaded = true;
         return Collections.unmodifiableMap(sampleContamination);
     }
 
     public void setSampleContamination(DefaultHashMap<String, Double> sampleContamination) {
         this.sampleContamination.clear();
+        this.sampleContaminationWasLoaded = !Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0;
+        if (!sampleContaminationWasLoaded)
+            for (final Double d : sampleContamination.values())
+                if (!Double.isNaN(d) && d > 0.0) {
+                    sampleContaminationWasLoaded = true;
+                    break;
+                }
         this.sampleContamination.putAll(sampleContamination);
         this.sampleContamination.setDefaultValue(CONTAMINATION_FRACTION);
     }
 
+    /**
+     * Returns true if there is some sample contamination present, false otherwise.
+     * @return {@code true} iff there is some sample contamination
+     */
+    public boolean isSampleContaminationPresent() {
+        return (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0) || sampleContaminationWasLoaded;
+    }
+
     //Needs to be here because it uses CONTAMINATION_FRACTION
     private DefaultHashMap<String,Double> sampleContamination = new DefaultHashMap<String,Double>(CONTAMINATION_FRACTION);
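
For reference, a minimal sketch of the default-value semantics the class above relies on. DefaultHashMap here is a toy stand-in for the GATK collection of the same name, and the flag logic mirrors setSampleContamination/isSampleContaminationPresent.

import java.util.HashMap;

// Toy stand-in: a map that falls back to a per-run default when a key is absent.
final class DefaultHashMap<K, V> extends HashMap<K, V> {
    private V defaultValue;
    DefaultHashMap(final V defaultValue) { this.defaultValue = defaultValue; }
    void setDefaultValue(final V defaultValue) { this.defaultValue = defaultValue; }
    @Override public V get(final Object key) {
        final V v = super.get(key);
        return v != null ? v : defaultValue;
    }
}

public final class ContaminationSketch {
    public static void main(final String[] args) {
        final double runDefault = 0.02; // plays the role of CONTAMINATION_FRACTION
        final DefaultHashMap<String, Double> perSample = new DefaultHashMap<>(runDefault);
        perSample.put("NA12878", 0.05);
        System.out.println(perSample.get("NA12878")); // 0.05, explicit entry
        System.out.println(perSample.get("unknown")); // 0.02, falls back to the default
        // Contamination is "present" if either the default or any explicit entry is > 0,
        // matching the sampleContaminationWasLoaded bookkeeping above.
        boolean present = runDefault > 0.0;
        for (final double d : perSample.values()) present |= !Double.isNaN(d) && d > 0.0;
        System.out.println(present); // true
    }
}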

GraphBasedLikelihoodCalculationEngine.java

@@ -43,16 +43,16 @@
  * 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
  * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
  */
 package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
 
 import org.apache.log4j.Logger;
 import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.SeqGraph;
 import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
-import org.broadinstitute.gatk.utils.exceptions.GATKException;
-import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
 import org.broadinstitute.gatk.utils.haplotype.Haplotype;
-import org.broadinstitute.gatk.utils.pairhmm.FlexibleHMM;
 import org.broadinstitute.gatk.utils.pairhmm.FastLoglessPairHMM;
+import org.broadinstitute.gatk.utils.pairhmm.FlexibleHMM;
 import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
 
 import java.io.File;
@@ -112,17 +112,19 @@ public class GraphBasedLikelihoodCalculationEngine implements ReadLikelihoodCalc
         debugMode = debugHaplotypeGraphAndLikelihoods ? DebugMode.EXTRA_DEBUG : debug ? DebugMode.DEBUG : DebugMode.NONE;
     }
 
     @Override
-    public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final Map<String, List<GATKSAMRecord>> perSampleReadList) {
+    public ReadLikelihoods<Haplotype> computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final List<String> samples, final Map<String, List<GATKSAMRecord>> perSampleReadList) {
         final GraphBasedLikelihoodCalculationEngineInstance graphLikelihoodEngine =
                 new GraphBasedLikelihoodCalculationEngineInstance(assemblyResultSet,
                         hmm,log10GlobalReadMismappingRate,heterogeneousKmerSizeResolution);
         final List<Haplotype> haplotypes = assemblyResultSet.getHaplotypeList();
         final List<Haplotype> supportedHaplotypes = graphLikelihoodEngine.getHaplotypeList();
-        if (supportedHaplotypes.size() != haplotypes.size()) logger.warn("Some haplotypes were dropped due to a missing route on the graph (supported / all): " + supportedHaplotypes.size() + "/" + haplotypes.size());
-        final Map<String,PerReadAlleleLikelihoodMap> result = graphLikelihoodEngine.computeReadLikelihoods(supportedHaplotypes,
-                perSampleReadList );
+        final ReadLikelihoods<Haplotype> result = graphLikelihoodEngine.computeReadLikelihoods(supportedHaplotypes, samples, perSampleReadList);
+        if (supportedHaplotypes.size() != haplotypes.size()) {
+            logger.warn("Some haplotypes were dropped due to a missing route on the graph (supported / all): " + supportedHaplotypes.size() + "/" + haplotypes.size());
+            result.addMissingAlleles(haplotypes,Double.NEGATIVE_INFINITY);
+        }
         if (debugMode != DebugMode.NONE) graphLikelihoodDebugDumps(assemblyResultSet.getRegionForGenotyping(), graphLikelihoodEngine,result);
         return result;
     }
@@ -131,7 +133,7 @@ public class GraphBasedLikelihoodCalculationEngine implements ReadLikelihoodCalc
      * A few debug messages associated with the GraphBased likelihoods engine.
      */
     private void graphLikelihoodDebugDumps(final ActiveRegion originalActiveRegion, final GraphBasedLikelihoodCalculationEngineInstance graphLikelihoodEngine,
-                                           final Map<String, PerReadAlleleLikelihoodMap> result) {
+                                           final ReadLikelihoods<Haplotype> result) {
         if (graphLikelihoodEngine.hasCycles())
             logger.debug("Resulting haplotype graph combining several kmer sizes has cycles");
         else if (graphLikelihoodEngine.haplotypeGraph.hasNonReferenceEnds())
@@ -144,14 +146,14 @@ public class GraphBasedLikelihoodCalculationEngine implements ReadLikelihoodCalc
             sq.simplifyGraph();
             sq.printGraph(new File(originalActiveRegion.getLocation() + "-" + graphLikelihoodEngine.getKmerSize() + "-haplotypeSeqGraph.dot"), 10000);
             try {
-                FileWriter fw = new FileWriter(new File(originalActiveRegion.getLocation() + "-likelihoods.txt"));
-                PrintWriter pw = new PrintWriter(fw);
+                final FileWriter fw = new FileWriter(new File(originalActiveRegion.getLocation() + "-likelihoods.txt"));
+                final PrintWriter pw = new PrintWriter(fw);
                 //Note: we only output the first sample likelihoods, perhaps should output all of them but for debugging this is normally what is needed.
-                pw.println(result.entrySet().iterator().next().getValue().toString());
+                pw.println(result.sampleMatrix(0)); // need to actually implement a proper toString for the SampleMatrix.
                 pw.close();
                 fw.close();
-            } catch (Exception ex) {
-                throw new GATKException("", ex);
+            } catch (final Exception ex) {
+                throw new IllegalStateException("", ex);
             }
         }
     }
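
The addMissingAlleles call above is new in this PR. A reasonable reading of it, sketched below with a plain array rather than the real API, is that haplotypes that lost their route on the graph still get a row in the likelihoods matrix, filled with the supplied value, so downstream code always sees the complete allele set.

import java.util.Arrays;

// Hypothetical sketch of what addMissingAlleles(haplotypes, Double.NEGATIVE_INFINITY)
// has to achieve; this is my reading of the call above, not the GATK implementation.
public final class MissingAlleleSketch {
    static double[][] addMissingRows(final double[][] values, final int missingCount, final double fill) {
        final int reads = values.length == 0 ? 0 : values[0].length;
        final double[][] result = Arrays.copyOf(values, values.length + missingCount);
        for (int i = values.length; i < result.length; i++) {
            result[i] = new double[reads];
            Arrays.fill(result[i], fill); // unreachable haplotypes get the filler likelihood
        }
        return result;
    }

    public static void main(final String[] args) {
        final double[][] supported = { { -1.0, -2.0 } }; // one supported haplotype, two reads
        final double[][] all = addMissingRows(supported, 1, Double.NEGATIVE_INFINITY);
        System.out.println(Arrays.deepToString(all)); // [[-1.0, -2.0], [-Infinity, -Infinity]]
    }
}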

GraphBasedLikelihoodCalculationEngineInstance.java

@@ -57,6 +57,7 @@ import org.broadinstitute.gatk.utils.Utils;
 import org.broadinstitute.gatk.utils.collections.CountSet;
 import org.broadinstitute.gatk.utils.collections.Pair;
 import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
 import org.broadinstitute.gatk.utils.haplotype.Haplotype;
 import org.broadinstitute.gatk.utils.pairhmm.FlexibleHMM;
 import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
@@ -215,6 +216,7 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
      *
      * @return {@code true} iff so.
      */
+    @SuppressWarnings("unused")
     public boolean hasVariation() {
         return hasVariation;
     }
@@ -231,27 +233,24 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
      * @return never {@code null}, and with at least one entry for each input sample (keys in {@code perSampleReadList}).
      *         The value maps can be potentially empty though.
      */
-    public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods(
-            final List<Haplotype> haplotypes,
+    public ReadLikelihoods<Haplotype> computeReadLikelihoods(final List<Haplotype> haplotypes, final List<String> samples,
                                                              final Map<String, List<GATKSAMRecord>> perSampleReadList) {
         // General preparation on the input haplotypes:
-        Collections.sort(haplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR);
-        final Map<Haplotype, Allele> alleleVersions = new LinkedHashMap<>(haplotypes.size());
-        for (final Haplotype haplotype : haplotypes)
-            alleleVersions.put(haplotype, Allele.create(haplotype,haplotype.isReference()));
+        final ReadLikelihoods<Haplotype> result = new ReadLikelihoods<>(samples, haplotypes, perSampleReadList);
+        final List<Haplotype> sortedHaplotypes = new ArrayList<>(haplotypes);
+        Collections.sort(sortedHaplotypes, Haplotype.ALPHANUMERICAL_COMPARATOR);
         // The actual work:
-        final HashMap<String, PerReadAlleleLikelihoodMap> result = new HashMap<>(perSampleReadList.size());
-        for (final Map.Entry<String, List<GATKSAMRecord>> e : perSampleReadList.entrySet()) {
-            final String sample = e.getKey();
-            final List<GATKSAMRecord> reads = e.getValue();
-            final Set<GATKSAMRecord> mayNeedAdjustment = new HashSet<>(reads.size());
+        final int sampleCount = result.sampleCount();
+        for (int s = 0; s < sampleCount; s++) {
+            final List<GATKSAMRecord> sampleReads = result.sampleReads(s);
             // Get the cost/likelihood of each read at relevant subpaths on the tree:
-            final Map<MultiDeBruijnVertex, Set<ReadSegmentCost>> costsByEndingVertex = calculatePathCostsByRead(reads, mayNeedAdjustment);
+            final Map<MultiDeBruijnVertex, Set<ReadSegmentCost>> costsByEndingVertex = calculatePathCostsByRead(sampleReads);
             // Create the resulting per-read maps:
-            final PerReadAlleleLikelihoodMap prallm = calculatePerReadAlleleLikelihoodMap(haplotypes, costsByEndingVertex, alleleVersions);
-            result.put(sample, prallm);
+            calculatePerReadAlleleLikelihoodMap(costsByEndingVertex,result.sampleMatrix(s) );
         }
+        result.normalizeLikelihoods(true,log10globalReadMismappingRate);
         logger.debug("Likelihood analysis summary: reads anchored " + anchoredReads + "/" + (anchoredReads + nonAnchoredReads) + "");
         return result;
     }
@@ -263,7 +262,6 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
      * @param fileName name of the output file.
      */
     public void printGraph(final String fileName) {
-        if (haplotypeGraph != null)
             haplotypeGraph.printGraph(fileName);
     }
@@ -281,36 +279,24 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
      *
      * @return {@code true} iff so.
      */
+    @SuppressWarnings("unused")
     public boolean hasCycles() {
-        // It is set to null if it contained cycles.
-        return haplotypeGraph == null;
+        return haplotypeGraph.hasCycles();
     }
 
     /**
      * Builds the result per-read allele likelihood map.
      *
-     * @param haplotypes haplotypes to process.
-     * @param costsEndingByVertex Read vs haplotype graph subpath costs indexed by ending vertex.
-     * @param alleleVersions map between haplotypes and the corresponding allele.
-     * @return never {@code null} although perhaps empty.
+     * @param costsEndingByVertex Read vs haplotype graph sub-path costs indexed by ending vertex.
+     * @param likelihoods matrix where to set the likelihoods, where the first index is the haplotype's and the second
+     *                    the read's.
      */
-    protected PerReadAlleleLikelihoodMap calculatePerReadAlleleLikelihoodMap(
-            final Collection<Haplotype> haplotypes,
-            final Map<MultiDeBruijnVertex, Set<ReadSegmentCost>> costsEndingByVertex, final Map<Haplotype, Allele> alleleVersions) {
-
-        final PerReadAlleleLikelihoodMap result = new PerReadAlleleLikelihoodMap();
-        if (haplotypeGraph == null)
-            return result;
-        final Map<GATKSAMRecord, Double> maxAlleleLogLk = new HashMap<>(anchoredReads + nonAnchoredReads + 10);
-        final Set<Haplotype> supportedHaplotypes = new LinkedHashSet<>(haplotypeGraph.getHaplotypes());
-        supportedHaplotypes.retainAll(haplotypes);
-        for (final Haplotype haplotype : supportedHaplotypes)
-            calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(haplotype, alleleVersions, result, maxAlleleLogLk, costsEndingByVertex);
-        makeLikelihoodAdjustment(alleleVersions, result, maxAlleleLogLk.keySet(), maxAlleleLogLk);
-        applyGlobalReadMismappingRate(alleleVersions, result, maxAlleleLogLk);
-        return result;
+    protected void calculatePerReadAlleleLikelihoodMap(final Map<MultiDeBruijnVertex, Set<ReadSegmentCost>> costsEndingByVertex,
+                                                       final ReadLikelihoods.Matrix<Haplotype> likelihoods) {
+        final int alleleCount = likelihoods.alleleCount();
+        for (int h = 0; h < alleleCount; h++)
+            calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(h, likelihoods, costsEndingByVertex);
     }
 
     /**
@@ -322,25 +308,24 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
      * "likelihood".
      * </p>
      *
-     * @param haplotype the target haplotype
-     * @param alleleVersions allele version of the haplotypes. These are the ones to be used in the final output.
-     * @param result target where to add the read-vs-haplotype likelihoods.
-     * @param maxAlleleLogLk where to place the maximum likelihood achieved on any haplotype for each read.
+     * @param haplotypeIndex the target haplotype index in the {@code likelihoods} matrix.
+     * @param likelihoods matrix of likelihoods.
      * @param costsEndingByVertex read costs grouped by their end vertex.
      */
-    private void calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(final Haplotype haplotype,
-                                                                        final Map<Haplotype, Allele> alleleVersions,
-                                                                        final PerReadAlleleLikelihoodMap result,
-                                                                        final Map<GATKSAMRecord, Double> maxAlleleLogLk,
+    private void calculatePerReadAlleleLikelihoodMapHaplotypeProcessing(final int haplotypeIndex,
+                                                                        final ReadLikelihoods.Matrix<Haplotype> likelihoods,
                                                                         final Map<MultiDeBruijnVertex, Set<ReadSegmentCost>> costsEndingByVertex) {
+        final Haplotype haplotype = likelihoods.allele(haplotypeIndex);
         final HaplotypeRoute haplotypeRoute = haplotypeGraph.getHaplotypeRoute(haplotype);
         final Set<MultiDeBruijnVertex> haplotypeVertices = haplotypeRoute.vertexSet();
         final Map<GATKSAMRecord, ReadCost> readCostByRead = new HashMap<>();
         final Set<MultiDeBruijnVertex> visitedVertices = new HashSet<>(haplotypeVertices.size());
         final List<MultiSampleEdge> edgeList = haplotypeRoute.getEdges();
         MultiDeBruijnVertex currentVertex = haplotypeRoute.getFirstVertex();
         Route<MultiDeBruijnVertex, MultiSampleEdge> pathSoFar = new Route<>(currentVertex, haplotypeGraph);
         final Iterator<MultiSampleEdge> edgeIterator = edgeList.iterator();
         while (true) {
             visitedVertices.add(currentVertex);
             final Set<ReadSegmentCost> finishingAtElementCostSet = costsEndingByVertex.get(currentVertex);
@@ -351,15 +336,12 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
             currentVertex = pathSoFar.getLastVertex();
         }
 
-        final List<ReadCost> readCosts = new ArrayList<>(readCostByRead.values());
-        Collections.sort(readCosts, ReadCost.COMPARATOR);
-        for (final ReadCost rc : readCosts)
-            result.add(rc.read, alleleVersions.get(haplotype), rc.getCost());
-
-        for (final ReadCost rc : readCosts) {
-            final Double currentMax = maxAlleleLogLk.get(rc.read);
-            if (currentMax == null || currentMax < rc.getCost())
-                maxAlleleLogLk.put(rc.read, rc.getCost());
+        int readIndex = 0;
+        for (final GATKSAMRecord read : likelihoods.reads()) {
+            final ReadCost rc = readCostByRead.get(read);
+            //if (rc != null)
+            likelihoods.set(haplotypeIndex,readIndex,rc == null ? Double.NEGATIVE_INFINITY : rc.getCost());
+            readIndex++;
         }
     }
@@ -443,33 +425,6 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
         }
     }
 
-    /**
-     * Makes sure that the reference allele likelihood is not too much smaller than the best alternative allele.
-     * The justification of this constraint is explained in
-     * {@link PairHMMLikelihoodCalculationEngine#computeDiploidHaplotypeLikelihoods}.
-     *
-     * @param alleleVersions correspondence between input haplotypes and output alleles.
-     * @param result the target result map.
-     * @param maxAlleleLogLk for each read indicates the likelihood of the best alternative allele.
-     */
-    private void applyGlobalReadMismappingRate(final Map<Haplotype, Allele> alleleVersions,
-                                               final PerReadAlleleLikelihoodMap result,
-                                               final Map<GATKSAMRecord, Double> maxAlleleLogLk) {
-        if (!Double.isNaN(log10globalReadMismappingRate) && !Double.isInfinite(log10globalReadMismappingRate)) {
-            final Allele referenceAllele = alleleVersions.get(haplotypeGraph.getReferenceHaplotype());
-            for (final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : result.getLikelihoodReadMap().entrySet()) {
-                final GATKSAMRecord read = entry.getKey();
-                final Map<Allele, Double> likelihoods = entry.getValue();
-                final Double maxLogLk = maxAlleleLogLk.get(read);
-                if (maxLogLk == null) continue;
-                final Double referenceLogLk = likelihoods.get(referenceAllele);
-                final Double minReferenceLogLk = maxLogLk + log10globalReadMismappingRate;
-                if (referenceLogLk == null || referenceLogLk < minReferenceLogLk)
-                    likelihoods.put(referenceAllele, minReferenceLogLk);
-            }
-        }
-    }
-
     /**
      * Calculates path costs for a set of reads.
      * <p/>
@@ -480,16 +435,15 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
      * </p>
      *
      * @param reads reads to analyze.
-     * @param mayNeedAdjustment set where to add reads whose likelihood might need adjustment.
      * @return never {@code null}.
      */
     protected Map<MultiDeBruijnVertex, Set<ReadSegmentCost>> calculatePathCostsByRead(
-            final List<GATKSAMRecord> reads, final Set<GATKSAMRecord> mayNeedAdjustment) {
+            final List<GATKSAMRecord> reads) {
         final Map<MultiDeBruijnVertex, Set<ReadSegmentCost>> result = new HashMap<>(reads.size());
         if (!hasVariation)
             return Collections.emptyMap();
         for (final GATKSAMRecord r : reads) {
-            calculatePathCostsByRead(r, mayNeedAdjustment, result);
+            calculatePathCostsByRead(r, result);
         }
         return result;
     }
@@ -498,10 +452,9 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
      * Calculates path cost for a single read.
      *
      * @param read target read.
-     * @param mayNeedAdjustment set where to add a read whose likelihood might need adjustment.
     * @param result map where to add the result.
     */
-    private void calculatePathCostsByRead(final GATKSAMRecord read, final Set<GATKSAMRecord> mayNeedAdjustment,
+    private void calculatePathCostsByRead(final GATKSAMRecord read,
                                           final Map<MultiDeBruijnVertex, Set<ReadSegmentCost>> result) {
         final ReadAnchoring anchoring = new ReadAnchoring(read,haplotypeGraph);
@@ -510,15 +463,12 @@ public class GraphBasedLikelihoodCalculationEngineInstance {
         if (!anchoring.isAnchoredSomewhere()) {
             defaultToRegularPairHMM(anchoring, result);
             nonAnchoredReads++;
-            return;
-        }
+        } else {
             calculateReadSegmentCosts(anchoring, hmm, result);
             if (!anchoring.isPerfectAnchoring()) danglingEndPathCosts(anchoring, hmm, result);
-            mayNeedAdjustment.add(read);
             anchoredReads++;
         }
+    }
 
     /**
     * Calculates read vs haplotype likelihoods using the classic PairHMM approach.
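
The deleted applyGlobalReadMismappingRate above is subsumed by the normalizeLikelihoods(true, log10globalReadMismappingRate) call on the new container. Below is a sketch of the kind of per-read cap involved; this is my reading of the replaced logic, and the real method may differ in detail (the deleted code, for instance, only floored the reference allele).

// Hypothetical sketch of a per-read likelihood floor driven by a log10 global
// read mismapping rate; not the actual ReadLikelihoods.normalizeLikelihoods code.
public final class MismappingCapSketch {
    static void capPerRead(final double[][] logLik, final double log10MismappingRate) {
        final int alleles = logLik.length;
        final int reads = alleles == 0 ? 0 : logLik[0].length;
        for (int r = 0; r < reads; r++) {
            double best = Double.NEGATIVE_INFINITY;
            for (int a = 0; a < alleles; a++) best = Math.max(best, logLik[a][r]);
            // The rate is negative in log10 space, so this is a floor below the best allele:
            final double floor = best + log10MismappingRate;
            for (int a = 0; a < alleles; a++) logLik[a][r] = Math.max(logLik[a][r], floor);
        }
    }

    public static void main(final String[] args) {
        final double[][] lik = { { -1.0, -2.0 }, { -50.0, -2.5 } }; // [allele][read]
        capPerRead(lik, -4.5);
        System.out.println(lik[1][0]); // -5.5: capped at best (-1.0) + (-4.5)
    }
}

The intuition is that no single read should be able to drive an allele's likelihood arbitrarily low, because with some small probability the read is simply mismapped.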

HaplotypeCaller.java

@@ -48,7 +48,9 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
 import com.google.java.contract.Ensures;
 import htsjdk.samtools.SAMFileWriter;
-import org.broadinstitute.gatk.utils.commandline.*;
+import htsjdk.variant.variantcontext.*;
+import htsjdk.variant.variantcontext.writer.VariantContextWriter;
+import htsjdk.variant.vcf.*;
 import org.broadinstitute.gatk.engine.CommandLineGATK;
 import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection;
 import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
@@ -75,11 +77,12 @@ import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
 import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState;
 import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState;
 import org.broadinstitute.gatk.utils.clipping.ReadClipper;
+import org.broadinstitute.gatk.utils.commandline.*;
 import org.broadinstitute.gatk.utils.exceptions.UserException;
 import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile;
 import org.broadinstitute.gatk.utils.fragments.FragmentCollection;
 import org.broadinstitute.gatk.utils.fragments.FragmentUtils;
-import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
 import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils;
 import org.broadinstitute.gatk.utils.gvcf.GVCFWriter;
 import org.broadinstitute.gatk.utils.haplotype.Haplotype;
@@ -89,12 +92,10 @@ import org.broadinstitute.gatk.utils.haplotypeBAMWriter.HaplotypeBAMWriter;
 import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature;
 import org.broadinstitute.gatk.utils.help.HelpConstants;
 import org.broadinstitute.gatk.utils.pairhmm.PairHMM;
+import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
 import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
 import org.broadinstitute.gatk.utils.sam.ReadUtils;
 import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType;
-import htsjdk.variant.variantcontext.*;
-import htsjdk.variant.variantcontext.writer.VariantContextWriter;
-import htsjdk.variant.vcf.*;
 import org.broadinstitute.gatk.utils.variant.HomoSapiensConstants;
 
 import java.io.FileNotFoundException;
@@ -932,12 +933,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
         final Map<String,List<GATKSAMRecord>> reads = splitReadsBySample( regionForGenotyping.getReads() );
 
         // Calculate the likelihoods: CPU intensive part.
-        final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,reads);
+        final ReadLikelihoods<Haplotype> readLikelihoods =
+                likelihoodCalculationEngine.computeReadLikelihoods(assemblyResult,samplesList,reads);
 
-        // Realign all the reads to the most likely haplotype for use by the annotations
-        for( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) {
-            entry.getValue().realignReadsToMostLikelyHaplotype(haplotypes, assemblyResult.getPaddedReferenceLoc());
-        }
+        // Realign reads to their best haplotype.
+        final Map<GATKSAMRecord,GATKSAMRecord> readRealignments = realignReadsToTheirBestHaplotype(readLikelihoods, assemblyResult.getPaddedReferenceLoc());
+        readLikelihoods.changeReads(readRealignments);
 
         // Note: we used to subset down at this point to only the "best" haplotypes in all samples for genotyping, but there
         // was a bad interaction between that selection and the marginalization that happens over each event when computing
@@ -947,7 +948,7 @@
         final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes = genotypingEngine.assignGenotypeLikelihoods(
                 haplotypes,
-                stratifiedReadMap,
+                readLikelihoods,
                 perSampleFilteredReadList,
                 assemblyResult.getFullReferenceWithPadding(),
                 assemblyResult.getPaddedReferenceLoc(),
@@ -964,7 +965,7 @@
                     assemblyResult.getPaddedReferenceLoc(),
                     haplotypes,
                     calledHaplotypes.getCalledHaplotypes(),
-                    stratifiedReadMap);
+                    readLikelihoods);
         }
 
         if( SCAC.DEBUG ) { logger.info("----------------------------------------------------------------------------------"); }
@@ -982,15 +983,36 @@
                 // output variant containing region.
                 result.addAll(referenceConfidenceModel.calculateRefConfidence(assemblyResult.getReferenceHaplotype(),
                         calledHaplotypes.getCalledHaplotypes(), assemblyResult.getPaddedReferenceLoc(), regionForGenotyping,
-                        stratifiedReadMap, calledHaplotypes.getCalls()));
+                        readLikelihoods, calledHaplotypes.getCalls()));
 
                 // output right-flanking non-variant section:
                 if (trimmingResult.hasRightFlankingRegion())
                     result.addAll(referenceModelForNoVariation(trimmingResult.nonVariantRightFlankRegion(),false));
                 return result;
             }
-        } else {
+        } else
             return calledHaplotypes.getCalls();
-        }
     }
 
+    /**
+     * Returns a map with the original read as a key and the realigned read as the value.
+     * <p>
+     *     Missing keys or equivalent key and value pairs mean that the read was not realigned.
+     * </p>
+     * @return never {@code null}
+     */
+    private Map<GATKSAMRecord,GATKSAMRecord> realignReadsToTheirBestHaplotype(final ReadLikelihoods<Haplotype> originalReadLikelihoods, final GenomeLoc paddedReferenceLoc) {
+        final Collection<ReadLikelihoods<Haplotype>.BestAllele> bestAlleles = originalReadLikelihoods.bestAlleles();
+        final Map<GATKSAMRecord,GATKSAMRecord> result = new HashMap<>(bestAlleles.size());
+
+        for (final ReadLikelihoods<Haplotype>.BestAllele bestAllele : bestAlleles) {
+            final GATKSAMRecord originalRead = bestAllele.read;
+            final Haplotype bestHaplotype = bestAllele.allele;
+            final boolean isInformative = bestAllele.isInformative();
+            final GATKSAMRecord realignedRead = AlignmentUtils.createReadAlignedToRef(originalRead,bestHaplotype,paddedReferenceLoc.getStart(),isInformative);
+            result.put(originalRead,realignedRead);
+        }
+        return result;
+    }
 
     private boolean containsCalls(final HaplotypeCallerGenotypingEngine.CalledHaplotypes calledHaplotypes) {
@@ -1086,22 +1108,14 @@
      * @param region the active region containing reads
      * @return a map from sample -> PerReadAlleleLikelihoodMap that maps each read to ref
      */
-    public static Map<String, PerReadAlleleLikelihoodMap> createDummyStratifiedReadMap(final Haplotype refHaplotype,
+    public static ReadLikelihoods<Haplotype> createDummyStratifiedReadMap(final Haplotype refHaplotype,
                                                                           final List<String> samples,
                                                                           final ActiveRegion region) {
-        final Allele refAllele = Allele.create(refHaplotype, true);
-
-        final Map<String, PerReadAlleleLikelihoodMap> map = new LinkedHashMap<>(1);
-        for ( final Map.Entry<String, List<GATKSAMRecord>> entry : splitReadsBySample(samples, region.getReads()).entrySet() ) {
-            final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
-            for ( final GATKSAMRecord read : entry.getValue() ) {
-                likelihoodMap.add(read, refAllele, 0.0);
-            }
-            map.put(entry.getKey(), likelihoodMap);
-        }
-        return map;
+        return new ReadLikelihoods<>(samples, Collections.singletonList(refHaplotype),
+                splitReadsBySample(samples, region.getReads()));
     }
 
     //---------------------------------------------------------------------------------------------------------------
     //
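
The new realignReadsToTheirBestHaplotype above leans on ReadLikelihoods.bestAlleles() and BestAllele.isInformative(). Below is a hypothetical re-implementation of that contract with an assumed margin threshold; the real class also tracks the owning sample and uses GATK's own informativeness constant.

// Hypothetical sketch of the BestAllele contract, not the GATK implementation.
public final class BestAlleleSketch {
    static final double INFORMATIVE_THRESHOLD = 0.2; // assumed margin, in log10 units

    // Index of the allele with the highest likelihood for one read.
    static int bestAlleleIndex(final double[] likPerAllele) {
        int best = 0;
        for (int a = 1; a < likPerAllele.length; a++)
            if (likPerAllele[a] > likPerAllele[best]) best = a;
        return best;
    }

    // A read is "informative" when its best allele beats the runner-up by the margin.
    static boolean isInformative(final double[] likPerAllele, final int best) {
        double secondBest = Double.NEGATIVE_INFINITY;
        for (int a = 0; a < likPerAllele.length; a++)
            if (a != best) secondBest = Math.max(secondBest, likPerAllele[a]);
        return likPerAllele[best] - secondBest > INFORMATIVE_THRESHOLD;
    }

    public static void main(final String[] args) {
        final double[] readLik = { -2.0, -8.0, -8.5 }; // one read vs three haplotypes
        final int best = bestAlleleIndex(readLik);
        System.out.println(best + " informative=" + isInformative(readLik, best)); // 0 informative=true
    }
}

Uninformative reads, those that fit several haplotypes about equally well, are realigned conservatively rather than forced onto an arbitrary winner.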

HaplotypeCallerGenotypingEngine.java

@ -57,8 +57,7 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.OutputMode;
import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.collections.DefaultHashMap; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.haplotype.EventMap; import org.broadinstitute.gatk.utils.haplotype.EventMap;
import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.haplotype.MergeVariantsAcrossHaplotypes; import org.broadinstitute.gatk.utils.haplotype.MergeVariantsAcrossHaplotypes;
@ -74,7 +73,7 @@ import java.util.*;
public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeCallerArgumentCollection> { public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeCallerArgumentCollection> {
private final static List<Allele> NO_CALL = Collections.singletonList(Allele.NO_CALL); private final static List<Allele> NO_CALL = Collections.singletonList(Allele.NO_CALL);
private final static int ALLELE_EXTENSION = 2; private static final int ALLELE_EXTENSION = 2;
private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger; private MergeVariantsAcrossHaplotypes crossHaplotypeEventMerger;
@ -161,17 +160,17 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
* Main entry point of class - given a particular set of haplotypes, samples and reference context, compute * Main entry point of class - given a particular set of haplotypes, samples and reference context, compute
* genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling * genotype likelihoods and assemble into a list of variant contexts and genomic events ready for calling
* *
* The list of samples we're working with is obtained from the haplotypeReadMap * The list of samples we're working with is obtained from the readLikelihoods
* *
* @param haplotypes Haplotypes to assign likelihoods to * @param haplotypes Haplotypes to assign likelihoods to
* @param haplotypeReadMap Map from reads->(haplotypes,likelihoods) * @param readLikelihoods Map from reads->(haplotypes,likelihoods)
* @param perSampleFilteredReadList Map from sample to reads that were filtered after assembly and before calculating per-read likelihoods. * @param perSampleFilteredReadList Map from sample to reads that were filtered after assembly and before calculating per-read likelihoods.
* @param ref Reference bytes at active region * @param ref Reference bytes at active region
* @param refLoc Corresponding active region genome location * @param refLoc Corresponding active region genome location
* @param activeRegionWindow Active window * @param activeRegionWindow Active window
* @param genomeLocParser GenomeLocParser * @param genomeLocParser GenomeLocParser
* @param activeAllelesToGenotype Alleles to genotype * @param activeAllelesToGenotype Alleles to genotype
* @param emitReferenceConfidence whether we should add a <NON_REF></NON_REF> alternative allele to the result variation contexts. * @param emitReferenceConfidence whether we should add a &lt;NON_REF&gt; alternative allele to the result variation contexts.
* *
* @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes * @return A CalledHaplotypes object containing a list of VC's with genotyped events and called haplotypes
* *
@ -180,7 +179,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
@Ensures("result != null") @Ensures("result != null")
// TODO - can this be refactored? this is hard to follow! // TODO - can this be refactored? this is hard to follow!
public CalledHaplotypes assignGenotypeLikelihoods( final List<Haplotype> haplotypes, public CalledHaplotypes assignGenotypeLikelihoods( final List<Haplotype> haplotypes,
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap, final ReadLikelihoods<Haplotype> readLikelihoods,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList, final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final byte[] ref, final byte[] ref,
final GenomeLoc refLoc, final GenomeLoc refLoc,
@ -191,7 +190,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
final boolean emitReferenceConfidence) { final boolean emitReferenceConfidence) {
// sanity check input arguments // sanity check input arguments
if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes); if (haplotypes == null || haplotypes.isEmpty()) throw new IllegalArgumentException("haplotypes input should be non-empty and non-null, got "+haplotypes);
if (haplotypeReadMap == null || haplotypeReadMap.isEmpty()) throw new IllegalArgumentException("haplotypeReadMap input should be non-empty and non-null, got "+haplotypeReadMap); if (readLikelihoods == null || readLikelihoods.sampleCount() == 0) throw new IllegalArgumentException("readLikelihoods input should be non-empty and non-null, got "+readLikelihoods);
if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got " + Arrays.toString(ref)); if (ref == null || ref.length == 0 ) throw new IllegalArgumentException("ref bytes input should be non-empty and non-null, got " + Arrays.toString(ref));
if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc); if (refLoc == null || refLoc.size() != ref.length) throw new IllegalArgumentException(" refLoc must be non-null and length must match ref bytes, got "+refLoc);
if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow); if (activeRegionWindow == null ) throw new IllegalArgumentException("activeRegionWindow must be non-null, got "+activeRegionWindow);
@ -200,12 +199,11 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
// update the haplotypes so we're ready to call, getting the ordered list of positions on the reference // update the haplotypes so we're ready to call, getting the ordered list of positions on the reference
// that carry events among the haplotypes // that carry events among the haplotypes
final TreeSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, haplotypeReadMap, ref, refLoc, activeAllelesToGenotype); final TreeSet<Integer> startPosKeySet = decomposeHaplotypesIntoVariantContexts(haplotypes, readLikelihoods, ref, refLoc, activeAllelesToGenotype);
// Walk along each position in the key set and create each event to be outputted // Walk along each position in the key set and create each event to be outputted
final Set<Haplotype> calledHaplotypes = new HashSet<>(); final Set<Haplotype> calledHaplotypes = new HashSet<>();
final List<VariantContext> returnCalls = new ArrayList<>(); final List<VariantContext> returnCalls = new ArrayList<>();
final Map<String, Double> emptyDownSamplingMap = new DefaultHashMap<>(0.0);
for( final int loc : startPosKeySet ) { for( final int loc : startPosKeySet ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region
@ -221,7 +219,9 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
// Merge the event to find a common reference representation // Merge the event to find a common reference representation
VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
final VariantContextBuilder vcb = new VariantContextBuilder(mergedVC); final VariantContextBuilder vcb = new VariantContextBuilder(mergedVC);
@ -250,19 +250,22 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
if (logger != null) logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); if (logger != null) logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
} }
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap(haplotypeReadMap, alleleMapper, configuration.getSampleContamination(), genomeLocParser, mergedVC); ReadLikelihoods<Allele> readAlleleLikelihoods = readLikelihoods.marginalize(alleleMapper,genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLoc(mergedVC),ALLELE_EXTENSION));
if (configuration.isSampleContaminationPresent())
readAlleleLikelihoods.contaminationDownsampling(configuration.getSampleContamination());
if (emitReferenceConfidence) addMiscellaneousAllele(alleleReadMap);
final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC ); if (emitReferenceConfidence)
final VariantContext call = calculateGenotypes(null, null, null, null, new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), calculationModel, false, null); readAlleleLikelihoods.addNonReferenceAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
final GenotypesContext genotypes = calculateGLsForThisEvent( readAlleleLikelihoods, mergedVC );
final VariantContext call = calculateGenotypes(null,null,null,null,new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), calculationModel, false,null);
if( call != null ) { if( call != null ) {
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap_annotations = ( configuration.USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap :
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, emptyDownSamplingMap, genomeLocParser, null ) );
if (emitReferenceConfidence) addMiscellaneousAllele(alleleReadMap_annotations);
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = addFilteredReadList(genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call, true);
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(tracker, stratifiedReadMap, call); readAlleleLikelihoods = prepareReadAlleleLikelihoodsForAnnotation(readLikelihoods, perSampleFilteredReadList,
genomeLocParser, emitReferenceConfidence, alleleMapper, readAlleleLikelihoods, call);
VariantContext annotatedCall = annotationEngine.annotateContextForActiveRegion(tracker,readAlleleLikelihoods, call);
if( call.getAlleles().size() != mergedVC.getAlleles().size() ) if( call.getAlleles().size() != mergedVC.getAlleles().size() )
annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall); annotatedCall = GATKVariantContextUtils.reverseTrimAlleles(annotatedCall);
@ -282,64 +285,92 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
return new CalledHaplotypes(returnCalls, calledHaplotypes); return new CalledHaplotypes(returnCalls, calledHaplotypes);
} }
/** // Builds the read-likelihoods collection to use for annotation considering user arguments and the collection
* Add the <NON_REF> allele // used for genotyping.
* @param stratifiedReadMap target per-read-allele-likelihood-map. private ReadLikelihoods<Allele> prepareReadAlleleLikelihoodsForAnnotation(
*/ final ReadLikelihoods<Haplotype> readHaplotypeLikelihoods,
public static Map<String, PerReadAlleleLikelihoodMap> addMiscellaneousAllele(final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) { final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final Allele miscellanoeusAllele = GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE; final GenomeLocParser genomeLocParser,
for (Map.Entry<String, PerReadAlleleLikelihoodMap> perSample : stratifiedReadMap.entrySet()) { final boolean emitReferenceConfidence,
for (Map.Entry<GATKSAMRecord, Map<Allele, Double>> perRead : perSample.getValue().getLikelihoodReadMap().entrySet()) { final Map<Allele, List<Haplotype>> alleleMapper,
double bestLikelihood = Double.NEGATIVE_INFINITY; final ReadLikelihoods<Allele> readAlleleLikelihoodsForGenotyping,
double secondBestLikelihood = Double.NEGATIVE_INFINITY; final VariantContext call) {
for (Map.Entry<Allele,Double> perAllele : perRead.getValue().entrySet()) {
final double value = perAllele.getValue(); final ReadLikelihoods<Allele> readAlleleLikelihoodsForAnnotations;
if (value > bestLikelihood) { final GenomeLoc loc = genomeLocParser.createGenomeLoc(call);
secondBestLikelihood = bestLikelihood;
bestLikelihood = value; // We can reuse for annotation the likelihood for genotyping as long as there is no contamination filtering
} else if (value < bestLikelihood && value > secondBestLikelihood) { // or the user want to use the contamination filtered set for annotations.
secondBestLikelihood = value; // Otherwise (else part) we need to do it again.
if (configuration.USE_FILTERED_READ_MAP_FOR_ANNOTATIONS || !configuration.isSampleContaminationPresent()) {
readAlleleLikelihoodsForAnnotations = readAlleleLikelihoodsForGenotyping;
readAlleleLikelihoodsForAnnotations.filterToOnlyOverlappingUnclippedReads(loc);
} else {
readAlleleLikelihoodsForAnnotations = readHaplotypeLikelihoods.marginalize(alleleMapper, loc);
if (emitReferenceConfidence)
readAlleleLikelihoodsForAnnotations.addNonReferenceAllele(
GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
} }
// Skim the filtered map based on the location so that we do not add filtered read that are going to be removed
// right after a few lines of code bellow.
final Map<String, List<GATKSAMRecord>> overlappingFilteredReads = overlappingFilteredReads(perSampleFilteredReadList, loc);
readAlleleLikelihoodsForAnnotations.addReads(overlappingFilteredReads,0);
return readAlleleLikelihoodsForAnnotations;
} }
final double miscellanousLikelihood = Double.isInfinite(secondBestLikelihood) ? bestLikelihood : secondBestLikelihood;
perSample.getValue().add(perRead.getKey(),miscellanoeusAllele,miscellanousLikelihood);
private Map<String, List<GATKSAMRecord>> overlappingFilteredReads(final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList, final GenomeLoc loc) {
final Map<String,List<GATKSAMRecord>> overlappingFilteredReads = new HashMap<>(perSampleFilteredReadList.size());
for (final Map.Entry<String,List<GATKSAMRecord>> sampleEntry : perSampleFilteredReadList.entrySet()) {
final List<GATKSAMRecord> originalList = sampleEntry.getValue();
final String sample = sampleEntry.getKey();
if (originalList == null || originalList.size() == 0)
continue;
final List<GATKSAMRecord> newList = new ArrayList<>(originalList.size());
for (final GATKSAMRecord read : originalList) {
if (ReadLikelihoods.unclippedReadOverlapsRegion(read, loc))
newList.add(read);
} }
if (newList.size() == 0)
continue;
overlappingFilteredReads.put(sample,newList);
} }
return stratifiedReadMap; return overlappingFilteredReads;
} }
/** /**
* Go through the haplotypes we assembled, and decompose them into their constituent variant contexts * Go through the haplotypes we assembled, and decompose them into their constituent variant contexts
* *
* @param haplotypes the list of haplotypes we're working with * @param haplotypes the list of haplotypes we're working with
* @param haplotypeReadMap map from samples -> the per read allele likelihoods * @param readLikelihoods map from samples -> the per read allele likelihoods
* @param ref the reference bases (over the same interval as the haplotypes) * @param ref the reference bases (over the same interval as the haplotypes)
* @param refLoc the span of the reference bases * @param refLoc the span of the reference bases
* @param activeAllelesToGenotype alleles we want to ensure are scheduled for genotyping (GGA mode) * @param activeAllelesToGenotype alleles we want to ensure are scheduled for genotyping (GGA mode)
* @return never {@code null} but perhaps an empty list if there is no variants to report. * @return never {@code null} but perhaps an empty list if there is no variants to report.
*/ */
private TreeSet<Integer> decomposeHaplotypesIntoVariantContexts(final List<Haplotype> haplotypes, private TreeSet<Integer> decomposeHaplotypesIntoVariantContexts(final List<Haplotype> haplotypes,
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap, final ReadLikelihoods readLikelihoods,
final byte[] ref, final byte[] ref,
final GenomeLoc refLoc, final GenomeLoc refLoc,
final List<VariantContext> activeAllelesToGenotype) { final List<VariantContext> activeAllelesToGenotype) {
final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty();
// Using the cigar from each called haplotype to figure out what events need to be written out in a VCF file // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
final TreeSet<Integer> startPosKeySet = EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, configuration.DEBUG); final TreeSet<Integer> startPosKeySet = EventMap.buildEventMapsForHaplotypes(haplotypes, ref, refLoc, configuration.DEBUG);
if ( in_GGA_mode ) startPosKeySet.clear();
//cleanUpSymbolicUnassembledEvents( haplotypes ); // We don't make symbolic alleles so this isn't needed currently
if ( !in_GGA_mode ) { if ( !in_GGA_mode ) {
// run the event merger if we're not in GGA mode // run the event merger if we're not in GGA mode
if (crossHaplotypeEventMerger == null) if (crossHaplotypeEventMerger == null)
throw new IllegalStateException(" no variant merger was provided at set-up when needed in GGA mode"); throw new IllegalStateException(" no variant merger was provided at set-up when needed in GGA mode");
final boolean mergedAnything = crossHaplotypeEventMerger.merge(haplotypes, haplotypeReadMap, startPosKeySet, ref, refLoc); final boolean mergedAnything = crossHaplotypeEventMerger.merge(haplotypes, readLikelihoods, startPosKeySet, ref, refLoc);
if ( mergedAnything ) if ( mergedAnything )
cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events cleanUpSymbolicUnassembledEvents( haplotypes ); // the newly created merged events could be overlapping the unassembled events
} } else {
else { startPosKeySet.clear();
for( final VariantContext compVC : activeAllelesToGenotype ) { for( final VariantContext compVC : activeAllelesToGenotype ) {
startPosKeySet.add( compVC.getStart() ); startPosKeySet.add( compVC.getStart() );
} }
@ -406,65 +437,31 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
/** /**
* For a particular event described in inputVC, form PL vector for each sample by looking into allele read map and filling likelihood matrix for each allele * For a particular event described in inputVC, form PL vector for each sample by looking into allele read map and filling likelihood matrix for each allele
* @param alleleReadMap Allele map describing mapping from reads to alleles and corresponding likelihoods * @param readLikelihoods Allele map describing mapping from reads to alleles and corresponding likelihoods
* @param mergedVC Input VC with event to genotype * @param mergedVC Input VC with event to genotype
* @return GenotypesContext object wrapping genotype objects with PLs * @return GenotypesContext object wrapping genotype objects with PLs
*/ */
@Requires({"alleleReadMap!= null", "mergedVC != null"}) @Requires({"readLikelihoods!= null", "mergedVC != null"})
@Ensures("result != null") @Ensures("result != null")
private GenotypesContext calculateGLsForThisEvent( final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap, final VariantContext mergedVC ) { private GenotypesContext calculateGLsForThisEvent( final ReadLikelihoods<Allele> readLikelihoods, final VariantContext mergedVC ) {
final GenotypesContext genotypes = GenotypesContext.create(alleleReadMap.size()); final GenotypesContext genotypes = GenotypesContext.create(readLikelihoods.sampleCount());
// Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
for( final String sample : alleleReadMap.keySet() ) { for (final String sample : readLikelihoods.samples() ) {
final int numHaplotypes = mergedVC.getAlleles().size(); final int numHaplotypes = mergedVC.getAlleles().size();
final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
final double[][] haplotypeLikelihoodMatrix = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleReadMap, mergedVC.getAlleles(), true); final double[][] haplotypeLikelihoodMatrix = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, readLikelihoods, mergedVC.getAlleles(), true);
int glIndex = 0; int glIndex = 0;
for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int iii = 0; iii < numHaplotypes; iii++ ) {
for( int jjj = 0; jjj <= iii; jjj++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) {
genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
} }
} }
logger.debug(" Likelihoods for sample " + sample + " : " + Arrays.toString(genotypeLikelihoods));
genotypes.add(new GenotypeBuilder(sample).alleles(NO_CALL).PL(genotypeLikelihoods).make()); genotypes.add(new GenotypeBuilder(sample).alleles(NO_CALL).PL(genotypeLikelihoods).make());
} }
return genotypes; return genotypes;
} }
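The glIndex traversal above emits genotype likelihoods in the standard VCF PL ordering (AA, AB, BB, AC, BC, CC for alleles A, B, C); genotype (j,i) with j <= i lands at linear index i*(i+1)/2 + j. A minimal, illustrative Java sketch of that ordering (not part of this commit):

// Illustrative sketch: enumerate diploid genotype indices in VCF PL order.
// For alleles indexed 0..n-1, genotype (j,i) with j <= i maps to index i*(i+1)/2 + j.
public class PLOrderDemo {
    public static void main(String[] args) {
        final String[] alleles = {"A", "B", "C"};
        int glIndex = 0;
        for (int i = 0; i < alleles.length; i++) {
            for (int j = 0; j <= i; j++) {
                // prints: 0 AA, 1 AB, 2 BB, 3 AC, 4 BC, 5 CC
                System.out.println(glIndex++ + " " + alleles[j] + alleles[i]);
            }
        }
    }
}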
private static Map<String, PerReadAlleleLikelihoodMap> addFilteredReadList(final GenomeLocParser parser,
final Map<String, PerReadAlleleLikelihoodMap> perSampleReadMap,
final Map<String, List<GATKSAMRecord>> perSampleFilteredReadList,
final VariantContext call,
final boolean requireOverlap) {
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new LinkedHashMap<>();
final GenomeLoc callLoc = ( requireOverlap ? parser.createGenomeLoc(call) : null );
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> sample : perSampleReadMap.entrySet() ) {
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
for( final Map.Entry<GATKSAMRecord,Map<Allele,Double>> mapEntry : sample.getValue().getLikelihoodReadMap().entrySet() ) {
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
if( !requireOverlap || callLoc.overlapsP(parser.createGenomeLocUnclipped(mapEntry.getKey())) ) {
for( final Map.Entry<Allele,Double> alleleDoubleEntry : mapEntry.getValue().entrySet() ) {
likelihoodMap.add(mapEntry.getKey(), alleleDoubleEntry.getKey(), alleleDoubleEntry.getValue());
}
}
}
// add all filtered reads to the NO_CALL list because they weren't given any likelihoods
for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) {
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
if( !requireOverlap || callLoc.overlapsP(parser.createGenomeLocUnclipped(read)) ) {
for( final Allele allele : call.getAlleles() ) {
likelihoodMap.add(read, allele, 0.0);
}
}
}
returnMap.put(sample.getKey(), likelihoodMap);
}
return returnMap;
}
/** /**
* Removes symbolic events from list of haplotypes * Removes symbolic events from list of haplotypes
* @param haplotypes Input/output list of haplotypes, before/after removal * @param haplotypes Input/output list of haplotypes, before/after removal
@ -490,48 +487,6 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
haplotypes.removeAll(haplotypesToRemove); haplotypes.removeAll(haplotypesToRemove);
} }
/**
* The reads, partitioned by haplotype, must now be partitioned by alleles.
* That is, some alleles are supported by multiple haplotypes and we marginalize over them by taking the max likelihood.
* In addition we subset down to only reads which overlap the alleles (plus a small extension)
* @param haplotypeReadMap Map from sample -> (read -> haplotype likelihoods)
* @param alleleMapper Map from alleles -> list of haplotypes which support that allele
* @param perSampleDownsamplingFraction Map from samples -> downsampling fraction
* @param eventsToGenotype the alleles to genotype in a single VariantContext, will be null if we don't want to require overlap
* @param genomeLocParser a genome loc parser
* @return Map from sample -> (read -> allele likelihoods)
*/
protected Map<String, PerReadAlleleLikelihoodMap> convertHaplotypeReadMapToAlleleReadMap( final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
final Map<Allele, List<Haplotype>> alleleMapper,
final Map<String,Double> perSampleDownsamplingFraction,
final GenomeLocParser genomeLocParser,
final VariantContext eventsToGenotype) {
final GenomeLoc callLoc = ( eventsToGenotype != null ? genomeLocParser.createGenomeLoc(eventsToGenotype) : null );
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new LinkedHashMap<>();
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
for( final Map.Entry<Allele, List<Haplotype>> alleleMapperEntry : alleleMapper.entrySet() ) { // for each output allele
final List<Haplotype> mappedHaplotypes = alleleMapperEntry.getValue();
for( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> readEntry : haplotypeReadMapEntry.getValue().getLikelihoodReadMap().entrySet() ) { // for each read
if( eventsToGenotype == null || callLoc.overlapsP(genomeLocParser.createPaddedGenomeLoc(genomeLocParser.createGenomeLocUnclipped(readEntry.getKey()), ALLELE_EXTENSION)) ) { // make sure the read overlaps
double maxLikelihood = Double.NEGATIVE_INFINITY;
for( final Map.Entry<Allele,Double> alleleDoubleEntry : readEntry.getValue().entrySet() ) { // for each input allele
if( mappedHaplotypes.contains( new Haplotype(alleleDoubleEntry.getKey())) ) { // exact match of haplotype base string
maxLikelihood = Math.max( maxLikelihood, alleleDoubleEntry.getValue() );
}
}
perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood);
}
}
}
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(perSampleDownsamplingFraction.get(haplotypeReadMapEntry.getKey())); // perform contamination downsampling
alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap);
}
return alleleReadMap;
}
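For reference, the marginalization rule this deleted method implemented, and which the new ReadLikelihoods container takes over, is: a read's likelihood for an allele is the maximum of its likelihoods over the haplotypes supporting that allele. A self-contained sketch of just that rule (hypothetical names, not the GATK API):

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Illustrative sketch of per-allele marginalization: for each allele take the
// best (maximum) log10 likelihood among the haplotypes that carry it.
final class MarginalizeDemo {
    // hapLikelihoods[h] = log10 Lik(read | haplotype h); alleleToHaplotypes maps
    // each allele name to the indices of its supporting haplotypes (assumed inputs).
    static Map<String, Double> marginalize(final double[] hapLikelihoods,
                                           final Map<String, List<Integer>> alleleToHaplotypes) {
        final Map<String, Double> result = new LinkedHashMap<>();
        for (final Map.Entry<String, List<Integer>> e : alleleToHaplotypes.entrySet()) {
            double max = Double.NEGATIVE_INFINITY;
            for (final int h : e.getValue())
                max = Math.max(max, hapLikelihoods[h]);
            result.put(e.getKey(), max);
        }
        return result;
    }
}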
protected static Map<Allele, List<Haplotype>> createAlleleMapper( final Map<VariantContext, Allele> mergeMap, final Map<Event, List<Haplotype>> eventMap ) { protected static Map<Allele, List<Haplotype>> createAlleleMapper( final Map<VariantContext, Allele> mergeMap, final Map<Event, List<Haplotype>> eventMap ) {
final Map<Allele, List<Haplotype>> alleleMapper = new LinkedHashMap<>(); final Map<Allele, List<Haplotype>> alleleMapper = new LinkedHashMap<>();
for( final Map.Entry<VariantContext, Allele> entry : mergeMap.entrySet() ) { for( final Map.Entry<VariantContext, Allele> entry : mergeMap.entrySet() ) {
@ -542,7 +497,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine<HaplotypeC
@Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"}) @Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"})
@Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) @Ensures({"result.size() == eventsAtThisLoc.size() + 1"})
protected static Map<Event, List<Haplotype>> createEventMapper( final int loc, final List<VariantContext> eventsAtThisLoc, final List<Haplotype> haplotypes ) { protected static Map<Event, List<Haplotype>> createEventMapper( final int loc, final List<VariantContext> eventsAtThisLoc, final List<Haplotype> haplotypes) {
final Map<Event, List<Haplotype>> eventMapper = new LinkedHashMap<>(eventsAtThisLoc.size()+1); final Map<Event, List<Haplotype>> eventMapper = new LinkedHashMap<>(eventsAtThisLoc.size()+1);
final Event refEvent = new Event(null); final Event refEvent = new Event(null);

View File

@ -53,7 +53,7 @@ import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.pairhmm.*; import org.broadinstitute.gatk.utils.pairhmm.*;
import org.broadinstitute.gatk.utils.recalibration.covariates.RepeatCovariate; import org.broadinstitute.gatk.utils.recalibration.covariates.RepeatCovariate;
@ -73,6 +73,7 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula
public static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual public static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual
private final byte constantGCP; private final byte constantGCP;
private final double log10globalReadMismappingRate; private final double log10globalReadMismappingRate;
private final PairHMM.HMM_IMPLEMENTATION hmmType; private final PairHMM.HMM_IMPLEMENTATION hmmType;
@ -177,40 +178,6 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula
pairHMMThreadLocal.get().close(); pairHMMThreadLocal.get().close();
} }
private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){
if ( WRITE_LIKELIHOODS_TO_FILE ) {
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
haplotype.getBaseString(),
new String(processedRead.getReadBases() ),
SAMUtils.phredToFastq(processedRead.getBaseQualities() ),
SAMUtils.phredToFastq(processedRead.getBaseInsertionQualities() ),
SAMUtils.phredToFastq(processedRead.getBaseDeletionQualities() ),
SAMUtils.phredToFastq(constantGCP),
log10l);
}
}
private Map<Allele, Haplotype> createAlleleMap(List<Haplotype> haplotypes){
final int numHaplotypes = haplotypes.size();
final Map<Allele, Haplotype> alleleMap = new LinkedHashMap<>(numHaplotypes);
for ( final Haplotype haplotype : haplotypes ) {
final Allele allele = Allele.create(haplotype, true);
alleleMap.put(allele, haplotype);
}
return alleleMap;
}
private Map<GATKSAMRecord, byte[]> fillGCPArrays(List<GATKSAMRecord> reads){
final Map<GATKSAMRecord, byte []> GCPArrayMap = new LinkedHashMap<>();
for (GATKSAMRecord read: reads){
byte [] GCPArray = new byte[read.getReadBases().length];
Arrays.fill( GCPArray, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
GCPArrayMap.put(read, GCPArray);
}
return GCPArrayMap;
}
private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) { private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) {
for( int kkk = 0; kkk < readQuals.length; kkk++ ) { for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
@ -229,9 +196,9 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula
* @return processedReads. A new list of reads, in the same order, whose qualities have been altered by the PCR error model and minimum-quality thresholding * @return processedReads. A new list of reads, in the same order, whose qualities have been altered by the PCR error model and minimum-quality thresholding
*/ */
private List<GATKSAMRecord> modifyReadQualities(final List<GATKSAMRecord> reads) { private List<GATKSAMRecord> modifyReadQualities(final List<GATKSAMRecord> reads) {
List<GATKSAMRecord> processedReads = new LinkedList<>(); final List<GATKSAMRecord> result = new ArrayList<>(reads.size());
for ( GATKSAMRecord read : reads ) {
for (final GATKSAMRecord read : reads) {
final byte[] readBases = read.getReadBases(); final byte[] readBases = read.getReadBases();
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
@ -244,71 +211,9 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula
// Create a new copy of the read and sets its base qualities to the modified versions. // Create a new copy of the read and sets its base qualities to the modified versions.
// Pack this into a new list for return // Pack this into a new list for return
final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, readInsQuals, readDelQuals); result.add(GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, readInsQuals, readDelQuals));
processedReads.add(processedRead);
} }
return processedReads; return result;
}
/**
* Post-processing of the read/allele likelihoods.
*
* We send quality-capped reads to the pairHMM for evaluation, and it returns a map containing these capped reads.
* We wish to return a map containing the original, unmodified reads.
*
* At the same time, we want to put an effective floor on the reference score, based on the global mis-mapping rate.
* This protects us from the case where the assembly has produced haplotypes
* that are very divergent from reference, but are supported by only one read. In effect
* we cap how badly the reference can score for any read by the chance that the read
* itself just doesn't belong here.
*
* @param perReadAlleleLikelihoodMap the original map returned by the PairHMM. Contains the processed reads, the haplotype Alleles, and their log10ls
* @param reads Our original, unmodified reads
* @param processedReads Reads whose minimum base,insertion,deletion qualities have been capped; these were actually used to derive log10ls
* @param alleleHaplotypeMap The map associating the Allele and Haplotype versions of each haplotype
*
* @return processedReadAlleleLikelihoodMap; a new PRALM containing the original reads, and their haplotype log10ls including capped reference log10ls
*/
private PerReadAlleleLikelihoodMap capReferenceHaplotypeLikelihoods(PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, List<GATKSAMRecord> reads, List<GATKSAMRecord> processedReads, Map<Allele, Haplotype> alleleHaplotypeMap){
// a new read/allele map, to contain the uncapped reads, haplotypes, and potentially the capped reference log10ls
final PerReadAlleleLikelihoodMap processedReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
final int numReads = reads.size();
for (int readIndex = 0; readIndex < numReads; readIndex++) {
// Get the original and quality-modified read from their respective lists
// Note that this requires both lists to have reads in the same order
final GATKSAMRecord originalRead = reads.get(readIndex);
final GATKSAMRecord processedRead = processedReads.get(readIndex);
double bestNonReflog10L = Double.NEGATIVE_INFINITY;
for ( final Allele allele : alleleHaplotypeMap.keySet() ) {
final double log10l = perReadAlleleLikelihoodMap.getLikelihoodAssociatedWithReadAndAllele(processedRead, allele);
final Haplotype haplotype = alleleHaplotypeMap.get(allele);
if ( haplotype.isNonReference() )
bestNonReflog10L = Math.max(bestNonReflog10L, log10l);
writeDebugLikelihoods(processedRead, haplotype, log10l);
// add the ORIGINAL (non-capped) read to the final map, along with the current haplotype and associated log10l
processedReadAlleleLikelihoodMap.add(originalRead, allele, log10l);
}
// ensure that no haplotype scores worse than the best non-ref haplotype minus the global
// mismapping rate. This protects us from the case where the assembly has produced haplotypes
// that are very divergent from reference, but are supported by only one read. In effect
// we cap how badly any haplotype can score for any read by the chance that the read
// itself just doesn't belong here.
final double worstAllowedLog10l = bestNonReflog10L + log10globalReadMismappingRate;
for ( final Allele allele : alleleHaplotypeMap.keySet() ) {
final double log10l = perReadAlleleLikelihoodMap.getLikelihoodAssociatedWithReadAndAllele(processedRead, allele);
if( log10l < worstAllowedLog10l ) {
processedReadAlleleLikelihoodMap.add(originalRead, allele, worstAllowedLog10l);
}
}
}
return processedReadAlleleLikelihoodMap;
} }
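The per-read floor this deleted method applied is what result.normalizeLikelihoods(false, log10globalReadMismappingRate) now performs inside the container. A minimal sketch of the rule, assuming log10-space likelihoods and a caller-supplied non-reference flag per allele (hypothetical names, not the GATK API):

// Illustrative sketch: floor each log10 likelihood for one read at
// bestNonRef + log10GlobalReadMismappingRate, mirroring the deleted loop above.
final class MismappingFloorDemo {
    static void capReadLikelihoods(final double[] log10Liks,
                                   final boolean[] isNonReference,
                                   final double log10GlobalReadMismappingRate) {
        double bestNonRef = Double.NEGATIVE_INFINITY;
        for (int a = 0; a < log10Liks.length; a++)
            if (isNonReference[a])
                bestNonRef = Math.max(bestNonRef, log10Liks[a]);
        // the rate is the log10 of a probability, hence negative: the floor sits just below the best non-ref score
        final double worstAllowed = bestNonRef + log10GlobalReadMismappingRate;
        for (int a = 0; a < log10Liks.length; a++)
            log10Liks[a] = Math.max(log10Liks[a], worstAllowed);
    }
}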
/** /**
@ -343,85 +248,120 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula
pairHMMThreadLocal.get().finalizeRegion(); pairHMMThreadLocal.get().finalizeRegion();
} }
@Override @Override
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods( final AssemblyResultSet assemblyResultSet, final Map<String, List<GATKSAMRecord>> perSampleReadList ) { public ReadLikelihoods<Haplotype> computeReadLikelihoods( final AssemblyResultSet assemblyResultSet, final List<String> samples, final Map<String, List<GATKSAMRecord>> perSampleReadList ) {
final List<Haplotype> haplotypes = assemblyResultSet.getHaplotypeList(); final List<Haplotype> haplotypes = assemblyResultSet.getHaplotypeList();
// configure the HMM // configure the HMM
initializePairHMM(haplotypes, perSampleReadList); initializePairHMM(haplotypes, perSampleReadList);
// Add likelihoods for each sample's reads to our stratifiedReadMap // Add likelihoods for each sample's reads to our result
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = new LinkedHashMap<>(); final ReadLikelihoods<Haplotype> result = new ReadLikelihoods<>(samples, haplotypes, perSampleReadList);
for( final Map.Entry<String, List<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) { final int sampleCount = result.sampleCount();
// evaluate the likelihood of the reads given those haplotypes for (int s = 0; s < sampleCount; s++) {
final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); final ReadLikelihoods.Matrix<Haplotype> sampleLikelihoods = result.sampleMatrix(s);
computeReadLikelihoods(sampleLikelihoods);
map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE);
stratifiedReadMap.put(sampleEntry.getKey(), map);
} }
//Used mostly by the JNI implementation(s) to free arrays
result.normalizeLikelihoods(false, log10globalReadMismappingRate);
result.filterPoorlyModeledReads(EXPECTED_ERROR_RATE_PER_BASE);
finalizePairHMM(); finalizePairHMM();
return result;
return stratifiedReadMap;
} }
private void computeReadLikelihoods( final ReadLikelihoods.Matrix<Haplotype> likelihoods) {
private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List<Haplotype> haplotypes, final List<GATKSAMRecord> reads) {
// Modify the read qualities by applying the PCR error model and capping the minimum base, insertion, and deletion qualities // Modify the read qualities by applying the PCR error model and capping the minimum base, insertion, and deletion qualities
List<GATKSAMRecord> processedReads = modifyReadQualities(reads); final List<GATKSAMRecord> processedReads = modifyReadQualities(likelihoods.reads());
// Get alleles corresponding to our haplotypes
Map<Allele, Haplotype> alleleHaplotypeMap = createAlleleMap(haplotypes);
// Get an array containing the constantGCP for each read in our modified read list
Map<GATKSAMRecord,byte[]> GCPArrayMap = fillGCPArrays(processedReads);
final Map<GATKSAMRecord,byte[]> gapContinuationPenalties = buildGapContinuationPenalties(processedReads,constantGCP);
// Run the PairHMM to calculate the log10 likelihood of each (processed) read's arising from each haplotype // Run the PairHMM to calculate the log10 likelihood of each (processed) read's arising from each haplotype
PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = pairHMMThreadLocal.get().computeLikelihoods(processedReads, alleleHaplotypeMap, GCPArrayMap); pairHMMThreadLocal.get().computeLikelihoods(likelihoods,processedReads,gapContinuationPenalties);
// Generate a new map containing the original, unmodified reads, and with minimal reference haplotype log10ls determined from the global mis-mapping rate if (WRITE_LIKELIHOODS_TO_FILE)
writeDebugLikelihoods(likelihoods);
}
return capReferenceHaplotypeLikelihoods(perReadAlleleLikelihoodMap, reads, processedReads, alleleHaplotypeMap); private Map<GATKSAMRecord, byte[]> buildGapContinuationPenalties(final List<GATKSAMRecord> processedReads, final byte gcp) {
final Map<GATKSAMRecord,byte[]> result = new HashMap<>(processedReads.size());
for (final GATKSAMRecord read : processedReads) {
final byte[] readGcpArray = new byte[read.getReadLength()];
Arrays.fill(readGcpArray,gcp);
result.put(read,readGcpArray);
}
return result;
}
private void writeDebugLikelihoods(final ReadLikelihoods.Matrix<Haplotype> likelihoods) {
final List<GATKSAMRecord> reads = likelihoods.reads();
final List<Haplotype> haplotypes = likelihoods.alleles();
final int haplotypeCount = haplotypes.size();
final int readCount = reads.size();
for (int r = 0; r < readCount; r++)
for (int a = 0; a < haplotypeCount; a++)
writeDebugLikelihoods(reads.get(r),haplotypes.get(a),likelihoods.get(a,r));
likelihoodsStream.flush();
}
private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
haplotype.getBaseString(),
new String(processedRead.getReadBases() ),
SAMUtils.phredToFastq(processedRead.getBaseQualities()),
SAMUtils.phredToFastq(processedRead.getBaseInsertionQualities() ),
SAMUtils.phredToFastq(processedRead.getBaseDeletionQualities() ),
SAMUtils.phredToFastq(constantGCP),
log10l);
} }
@Requires({"alleleOrdering.size() > 0"}) @Requires({"alleleOrdering.size() > 0"})
@Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"}) @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"})
@Deprecated
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, public static double[][] computeDiploidHaplotypeLikelihoods( final String sample,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final ReadLikelihoods readLikelihoods,
final List<Allele> alleleOrdering, final List alleleOrdering,
final boolean normalize ) { final boolean normalize ) {
return computeDiploidHaplotypeLikelihoods(Collections.singleton(sample), stratifiedReadMap, alleleOrdering, normalize); return computeDiploidHaplotypeLikelihoods(Collections.singleton(sample), readLikelihoods, alleleOrdering, normalize);
} }
@Requires({"alleleOrdering.size() > 0"}) @Requires({"alleleOrdering.size() > 0"})
@Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"}) @Ensures({"result.length == result[0].length", "result.length == alleleOrdering.size()"})
public static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples, @Deprecated
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, private static double[][] computeDiploidHaplotypeLikelihoods( final Set<String> samples,
final List<Allele> alleleOrdering, final ReadLikelihoods readLikelihoods,
final List alleleOrdering,
final boolean normalize) { final boolean normalize) {
final int numHaplotypes = alleleOrdering.size(); final int numHaplotypes = alleleOrdering.size();
final int[] alleleIndices = new int[alleleOrdering.size()];
final ListIterator alleleIterator = alleleOrdering.listIterator();
int nextAlleleIndex = 0;
while (alleleIterator.hasNext())
if ((alleleIndices[nextAlleleIndex++] = readLikelihoods.alleleIndex((Allele) alleleIterator.next())) == -1)
throw new IllegalArgumentException("allele " + alleleIterator.previous() + " not found in likelihood collection ");
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
for( int iii = 0; iii < numHaplotypes; iii++ ) {
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
}
// compute the diploid haplotype likelihoods // compute the diploid haplotype likelihoods
for(final String sample : samples) {
final int sampleIndex = readLikelihoods.sampleIndex(sample);
if (sampleIndex == -1)
throw new IllegalArgumentException("the sample provided is not in the likelihood collection");
final ReadLikelihoods.Matrix sampleLikelihoods = readLikelihoods.sampleMatrix(sampleIndex);
final int sampleReadCount = readLikelihoods.sampleReadCount(sampleIndex);
for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int iii = 0; iii < numHaplotypes; iii++ ) {
final Allele iii_allele = alleleOrdering.get(iii); final int iii_allele = alleleIndices[iii];
for( int jjj = 0; jjj <= iii; jjj++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) {
final Allele jjj_allele = alleleOrdering.get(jjj); final int jjj_allele = alleleIndices[jjj];
double haplotypeLikelihood = 0.0; double haplotypeLikelihood = 0.0;
for( final String sample : samples ) { for (int r = 0; r < sampleReadCount; r++) {
for( final Map.Entry<GATKSAMRecord, Map<Allele,Double>> entry : stratifiedReadMap.get(sample).getLikelihoodReadMap().entrySet() ) { final double value = MathUtils.approximateLog10SumLog10(sampleLikelihoods.get(iii_allele,r),
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) sampleLikelihoods.get(jjj_allele,r)) + MathUtils.LOG_ONE_HALF;
// First term is approximated by Jacobian log with table lookup. haplotypeLikelihood += value;
haplotypeLikelihood += ( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + MathUtils.LOG_ONE_HALF );
} }
haplotypeLikelihoodMatrix[iii][jjj] += haplotypeLikelihood;
} }
haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood;
} }
} }
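The identity behind the per-read diploid term can be checked numerically: log10(10^x1/2 + 10^x2/2) = log10(10^x1 + 10^x2) - log10(2), which is what approximateLog10SumLog10(x1, x2) + MathUtils.LOG_ONE_HALF computes above (the sum term via a Jacobian-log table lookup). A plain-Java check, illustrative only:

// Numerical check of the diploid per-read term: averaging two allele
// likelihoods in real space equals a log10-sum minus log10(2) in log space.
public class DiploidTermCheck {
    public static void main(String[] args) {
        final double x1 = -3.2, x2 = -1.7;               // example log10 likelihoods
        final double direct = Math.log10(Math.pow(10, x1) / 2 + Math.pow(10, x2) / 2);
        final double viaSum = Math.log10(Math.pow(10, x1) + Math.pow(10, x2)) + Math.log10(0.5);
        System.out.println(direct + " == " + viaSum);    // agree to floating-point precision
    }
}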
@ -431,6 +371,7 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula
@Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"}) @Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"})
@Ensures({"result.length == result[0].length", "result.length == likelihoodMatrix.length"}) @Ensures({"result.length == result[0].length", "result.length == likelihoodMatrix.length"})
@Deprecated
protected static double[][] normalizeDiploidLikelihoodMatrixFromLog10( final double[][] likelihoodMatrix ) { protected static double[][] normalizeDiploidLikelihoodMatrixFromLog10( final double[][] likelihoodMatrix ) {
final int numHaplotypes = likelihoodMatrix.length; final int numHaplotypes = likelihoodMatrix.length;
double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2]; double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2];
@ -450,131 +391,6 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula
return likelihoodMatrix; return likelihoodMatrix;
} }
// --------------------------------------------------------------------------------
//
// System to compute the best N haplotypes for genotyping
//
// --------------------------------------------------------------------------------
//
// /**
// * Helper function for selectBestHaplotypesFromEachSample that updates the score of haplotype haplotypeAsAllele
// * @param map an annoying map object that moves us between the allele and haplotype representation
// * @param haplotypeAsAllele the allele version of the haplotype
// * @return the haplotype version, with its score incremented by 1 if it's non-reference
// */
// private Haplotype updateSelectHaplotype(final Map<Allele, Haplotype> map, final Allele haplotypeAsAllele) {
// final Haplotype h = map.get(haplotypeAsAllele); // TODO -- fixme when haplotypes are properly generic
// if ( h.isNonReference() ) h.setScore(h.getScore() + 1); // ref is already at max value
// return h;
// }
//
// /**
// * Take the best N haplotypes and return them as a list
// *
// * Only considers the haplotypes selectedHaplotypes that were actually selected by at least one sample
// * as its preferred haplotype. Takes the best N haplotypes from selectedHaplotypes in decreasing
// * order of score (so higher score haplotypes are preferred). The N we take is determined by
// *
// * N = min(2 * nSamples + 1, maxNumHaplotypesInPopulation)
// *
// * where 2 * nSamples + 1 is the number of chromosomes in nSamples samples plus one for the reference, and our workload is
// * bounded by maxNumHaplotypesInPopulation as that number can grow without bound
// *
// * @param selectedHaplotypes a non-null set of haplotypes with scores >= 1
// * @param nSamples the number of samples used to select the haplotypes
// * @param maxNumHaplotypesInPopulation the maximum number of haplotypes we're allowed to take, regardless of nSamples
// * @return a list of N or fewer haplotypes, with the reference haplotype first
// */
// private List<Haplotype> selectBestHaplotypesAccordingToScore(final Set<Haplotype> selectedHaplotypes, final int nSamples, final int maxNumHaplotypesInPopulation) {
// final List<Haplotype> selectedHaplotypesList = new ArrayList<>(selectedHaplotypes);
// Collections.sort(selectedHaplotypesList, new HaplotypeScoreComparator());
// final int numChromosomesInSamplesPlusRef = 2 * nSamples + 1;
// final int haplotypesToKeep = Math.min(numChromosomesInSamplesPlusRef, maxNumHaplotypesInPopulation);
// final List<Haplotype> bestHaplotypes = selectedHaplotypesList.size() <= haplotypesToKeep ? selectedHaplotypesList : selectedHaplotypesList.subList(0, haplotypesToKeep);
// if ( bestHaplotypes.get(0).isNonReference()) throw new IllegalStateException("BUG: reference haplotype should be first in list");
// return bestHaplotypes;
// }
//
// /**
// * Select the best haplotypes for genotyping the samples in stratifiedReadMap
// *
// * Selects these haplotypes by counting up how often each haplotype is selected as one of the most likely
// * haplotypes per sample. What this means is that each sample computes the diploid genotype likelihoods for
// * all possible pairs of haplotypes, and each haplotype in the highest-likelihood pair gets
// * one extra count (so hom-var haplotypes get two counts). After performing this calculation
// * the best N haplotypes are selected (@see #selectBestHaplotypesAccordingToScore) and a list of the
// * haplotypes in order of score are returned, ensuring that at least one of the haplotypes is reference.
// *
// * @param haplotypes a list of all haplotypes we're considering
// * @param stratifiedReadMap a map from sample -> read likelihoods per haplotype
// * @param maxNumHaplotypesInPopulation the max. number of haplotypes we can select from haplotypes
// * @return a list of selected haplotypes with size <= maxNumHaplotypesInPopulation
// */
// public List<Haplotype> selectBestHaplotypesFromEachSample(final List<Haplotype> haplotypes, final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final int maxNumHaplotypesInPopulation) {
// if ( haplotypes.size() < 2 ) throw new IllegalArgumentException("Must have at least 2 haplotypes to consider but only have " + haplotypes);
//
// if ( haplotypes.size() == 2 ) return haplotypes; // fast path -- we'll always want to use 2 haplotypes
//
// // all of the haplotypes that at least one sample called as one of the most likely
// final Set<Haplotype> selectedHaplotypes = new HashSet<>();
// selectedHaplotypes.add(findReferenceHaplotype(haplotypes)); // ref is always one of the selected
//
// // our annoying map from allele -> haplotype
// final Map<Allele, Haplotype> allele2Haplotype = new HashMap<>();
// for ( final Haplotype h : haplotypes ) {
// h.setScore(h.isReference() ? Double.MAX_VALUE : 0.0); // set all of the scores to 0 (lowest value) for all non-ref haplotypes
// allele2Haplotype.put(Allele.create(h, h.isReference()), h);
// }
//
// // for each sample, compute the most likely pair of haplotypes
// for ( final Map.Entry<String, PerReadAlleleLikelihoodMap> entry : stratifiedReadMap.entrySet() ) {
// // get the two most likely haplotypes under a diploid model for this sample
// final MostLikelyAllele mla = entry.getValue().getMostLikelyDiploidAlleles();
//
// if ( mla != null ) { // there was something to evaluate in this sample
// // note that there must be at least 2 haplotypes
// final Haplotype best = updateSelectHaplotype(allele2Haplotype, mla.getMostLikelyAllele());
// final Haplotype second = updateSelectHaplotype(allele2Haplotype, mla.getSecondMostLikelyAllele());
//
//// if ( DEBUG ) {
//// logger.info("Chose haplotypes " + best + " " + best.getCigar() + " and " + second + " " + second.getCigar() + " for sample " + entry.getKey());
//// }
//
// // add these two haplotypes to the set of haplotypes that have been selected
// selectedHaplotypes.add(best);
// selectedHaplotypes.add(second);
//
// // we've already selected all of our haplotypes, and we don't need to prune them down
// if ( selectedHaplotypes.size() == haplotypes.size() && haplotypes.size() < maxNumHaplotypesInPopulation )
// break;
// }
// }
//
// // take the best N haplotypes forward, in order of the number of samples that choose them
// final int nSamples = stratifiedReadMap.size();
// final List<Haplotype> bestHaplotypes = selectBestHaplotypesAccordingToScore(selectedHaplotypes, nSamples, maxNumHaplotypesInPopulation);
//
// if ( DEBUG ) {
// logger.info("Chose " + (bestHaplotypes.size() - 1) + " alternate haplotypes to genotype in all samples.");
// for ( final Haplotype h : bestHaplotypes ) {
// logger.info("\tHaplotype " + h.getCigar() + " selected for further genotyping" + (h.isNonReference() ? " found " + (int)h.getScore() + " haplotypes" : " as ref haplotype"));
// }
// }
// return bestHaplotypes;
// }
//
// /**
// * Find the haplotype that isRef(), or @throw ReviewedGATKException if one isn't found
// * @param haplotypes non-null list of haplotypes
// * @return the reference haplotype
// */
// private static Haplotype findReferenceHaplotype( final List<Haplotype> haplotypes ) {
// for( final Haplotype h : haplotypes ) {
// if( h.isReference() ) return h;
// }
// throw new ReviewedGATKException( "No reference haplotype found in the list of haplotypes!" );
// }
// -------------------------------------------------------------------------------- // --------------------------------------------------------------------------------
// //
// Experimental attempts at PCR error rate modeling // Experimental attempts at PCR error rate modeling

View File

@ -43,10 +43,11 @@
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. * 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/ */
package org.broadinstitute.gatk.tools.walkers.haplotypecaller; package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Allele;
@ -62,21 +63,25 @@ import java.util.Random;
public class RandomLikelihoodCalculationEngine implements ReadLikelihoodCalculationEngine { public class RandomLikelihoodCalculationEngine implements ReadLikelihoodCalculationEngine {
@Override @Override
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods(final AssemblyResultSet assemblyResultSet, final Map<String, List<GATKSAMRecord>> reads) { public ReadLikelihoods computeReadLikelihoods(final AssemblyResultSet assemblyResultSet,
final List<String> samples,
final Map<String, List<GATKSAMRecord>> reads) {
final List<Haplotype> haplotypes = assemblyResultSet.getHaplotypeList(); final List<Haplotype> haplotypes = assemblyResultSet.getHaplotypeList();
final Map<String,PerReadAlleleLikelihoodMap> result = new HashMap<>(reads.size()); final ReadLikelihoods result = new ReadLikelihoods(samples, haplotypes, reads);
final Map<Haplotype,Allele> alleles = new HashMap<>(haplotypes.size()); final Map<Haplotype,Allele> alleles = new HashMap<>(haplotypes.size());
for (final Haplotype haplotype : haplotypes) for (final Haplotype haplotype : haplotypes)
alleles.put(haplotype,Allele.create(haplotype,false)); alleles.put(haplotype,Allele.create(haplotype,false));
final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
for (final String sample : reads.keySet()) { final int sampleCount = samples.size();
final PerReadAlleleLikelihoodMap pralm = new PerReadAlleleLikelihoodMap(); final int alleleCount = alleles.size();
for (final GATKSAMRecord read : reads.get(sample)) for (int i = 0; i < sampleCount; i++) {
for (final Haplotype haplotype : haplotypes ) final List<GATKSAMRecord> sampleReads = result.sampleReads(i);
pralm.add(read,alleles.get(haplotype),-Math.abs(rnd.nextDouble())); final int readCount = sampleReads.size();
result.put(sample,pralm); final ReadLikelihoods.Matrix<Haplotype> sampleLikelihoods = result.sampleMatrix(i);
for (int a = 0; a < alleleCount; a++)
for (int r = 0; r < readCount; r++)
sampleLikelihoods.set(a,r,-Math.abs(rnd.nextDouble()));
} }
return result; return result;
} }
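This fill loop shows the central change of the commit: per-sample likelihoods live in a flat allele-by-read matrix instead of nested sample -> read -> allele maps. A condensed usage sketch built only from calls that appear in this diff (assumes the commit's ReadLikelihoods, Haplotype and GATKSAMRecord classes on the classpath; inputs are caller-supplied, as in the engines above):

// Usage sketch: fill one sample's allele-by-read likelihood matrix.
static void fillUniform(final List<String> samples,
                        final List<Haplotype> haplotypes,
                        final Map<String, List<GATKSAMRecord>> perSampleReadList) {
    final ReadLikelihoods<Haplotype> likelihoods =
            new ReadLikelihoods<>(samples, haplotypes, perSampleReadList);
    for (int s = 0; s < likelihoods.sampleCount(); s++) {
        final ReadLikelihoods.Matrix<Haplotype> m = likelihoods.sampleMatrix(s);
        final int readCount = likelihoods.sampleReadCount(s);
        for (int a = 0; a < haplotypes.size(); a++)
            for (int r = 0; r < readCount; r++)
                m.set(a, r, -1.0); // one flat array write replaces three nested map lookups
    }
}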

View File

@ -46,7 +46,8 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller; package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import java.util.List; import java.util.List;
@ -87,7 +88,7 @@ public interface ReadLikelihoodCalculationEngine {
* @return never {@code null}, and with at least one entry per input sample (keys in {@code perSampleReadList}). * @return never {@code null}, and with at least one entry per input sample (keys in {@code perSampleReadList}).
* The value maps can potentially be empty, though. * The value maps can potentially be empty, though.
*/ */
public Map<String, PerReadAlleleLikelihoodMap> computeReadLikelihoods(AssemblyResultSet assemblyResultSet, public ReadLikelihoods<Haplotype> computeReadLikelihoods(AssemblyResultSet assemblyResultSet, List<String> samples,
Map<String, List<GATKSAMRecord>> perSampleReadList); Map<String, List<GATKSAMRecord>> perSampleReadList);
public void close(); public void close();

View File

@ -47,13 +47,16 @@
package org.broadinstitute.gatk.tools.walkers.haplotypecaller; package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import htsjdk.samtools.*; import htsjdk.samtools.*;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFSimpleHeaderLine;
import org.broadinstitute.gatk.engine.contexts.AlignmentContext; import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.PileupElement;
@ -62,9 +65,6 @@ import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFSimpleHeaderLine;
import java.io.File; import java.io.File;
import java.util.*; import java.util.*;
@ -86,6 +86,7 @@ public class ReferenceConfidenceModel {
private final GenomeLocParser genomeLocParser; private final GenomeLocParser genomeLocParser;
private final Set<String> samples; private final Set<String> samples;
private final SAMFileHeader header; // TODO -- really shouldn't depend on this
private final int indelInformativeDepthIndelSize; private final int indelInformativeDepthIndelSize;
private final static boolean WRITE_DEBUGGING_BAM = false; private final static boolean WRITE_DEBUGGING_BAM = false;
@ -113,6 +114,7 @@ public class ReferenceConfidenceModel {
this.genomeLocParser = genomeLocParser; this.genomeLocParser = genomeLocParser;
this.samples = samples; this.samples = samples;
this.header = header;
this.indelInformativeDepthIndelSize = indelInformativeDepthIndelSize; this.indelInformativeDepthIndelSize = indelInformativeDepthIndelSize;
if ( WRITE_DEBUGGING_BAM ) { if ( WRITE_DEBUGGING_BAM ) {
@ -156,10 +158,10 @@ public class ReferenceConfidenceModel {
* *
* @param refHaplotype the reference haplotype, used to get the reference bases across activeRegion.getLoc() * @param refHaplotype the reference haplotype, used to get the reference bases across activeRegion.getLoc()
* @param calledHaplotypes a list of haplotypes that segregate in this region, for realignment of the reads in the * @param calledHaplotypes a list of haplotypes that segregate in this region, for realignment of the reads in the
* stratifiedReadMap, corresponding to each read's best haplotype. Must contain the refHaplotype. * readLikelihoods, corresponding to each read's best haplotype. Must contain the refHaplotype.
* @param paddedReferenceLoc the location of refHaplotype (which might be larger than activeRegion.getLoc()) * @param paddedReferenceLoc the location of refHaplotype (which might be larger than activeRegion.getLoc())
* @param activeRegion the active region we want to get the reference confidence over * @param activeRegion the active region we want to get the reference confidence over
* @param stratifiedReadMap a map from a single sample to its PerReadAlleleLikelihoodMap for each haplotype in calledHaplotypes * @param readLikelihoods the read likelihoods of the single sample against each haplotype in calledHaplotypes
* @param variantCalls calls made in this region. The return result will contain any variant call in this list in the * @param variantCalls calls made in this region. The return result will contain any variant call in this list in the
* correct order by genomic position, and any variant in this list will stop us emitting a ref confidence * correct order by genomic position, and any variant in this list will stop us emitting a ref confidence
* under any position it covers (for snps and insertions that is 1 bp, but for deletions its the entire ref span) * under any position it covers (for snps and insertions that is 1 bp, but for deletions its the entire ref span)
@ -170,22 +172,22 @@ public class ReferenceConfidenceModel {
final Collection<Haplotype> calledHaplotypes, final Collection<Haplotype> calledHaplotypes,
final GenomeLoc paddedReferenceLoc, final GenomeLoc paddedReferenceLoc,
final ActiveRegion activeRegion, final ActiveRegion activeRegion,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap, final ReadLikelihoods<Haplotype> readLikelihoods,
final List<VariantContext> variantCalls) { final List<VariantContext> variantCalls) {
if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null");
if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null");
if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype");
if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null");
if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null");
if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); if ( readLikelihoods == null ) throw new IllegalArgumentException("readLikelihoods cannot be null");
if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); if ( readLikelihoods.sampleCount() != 1 ) throw new IllegalArgumentException("readLikelihoods must contain exactly one sample but it contained " + readLikelihoods.sampleCount());
if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion extended location size " + activeRegion.getExtendedLoc().size() + " are different"); if ( refHaplotype.length() != activeRegion.getExtendedLoc().size() ) throw new IllegalArgumentException("refHaplotype " + refHaplotype.length() + " and activeRegion extended location size " + activeRegion.getExtendedLoc().size() + " are different");
final GenomeLoc refSpan = activeRegion.getLocation(); final GenomeLoc refSpan = activeRegion.getLocation();
final List<ReadBackedPileup> refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, activeRegion, refSpan, stratifiedReadMap); final List<ReadBackedPileup> refPileups = getPileupsOverReference(refHaplotype, calledHaplotypes, paddedReferenceLoc, activeRegion, refSpan, readLikelihoods);
final byte[] ref = refHaplotype.getBases(); final byte[] ref = refHaplotype.getBases();
final List<VariantContext> results = new ArrayList<>(refSpan.size()); final List<VariantContext> results = new ArrayList<>(refSpan.size());
final String sampleName = stratifiedReadMap.keySet().iterator().next(); final String sampleName = readLikelihoods.sample(0);
final int globalRefOffset = refSpan.getStart() - activeRegion.getExtendedLoc().getStart(); final int globalRefOffset = refSpan.getStart() - activeRegion.getExtendedLoc().getStart();
for ( final ReadBackedPileup pileup : refPileups ) { for ( final ReadBackedPileup pileup : refPileups ) {
@ -311,15 +313,15 @@ public class ReferenceConfidenceModel {
final GenomeLoc paddedReferenceLoc, final GenomeLoc paddedReferenceLoc,
final ActiveRegion activeRegion, final ActiveRegion activeRegion,
final GenomeLoc activeRegionSpan, final GenomeLoc activeRegionSpan,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) { final ReadLikelihoods<Haplotype> readLikelihoods) {
if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null"); if ( refHaplotype == null ) throw new IllegalArgumentException("refHaplotype cannot be null");
if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null"); if ( calledHaplotypes == null ) throw new IllegalArgumentException("calledHaplotypes cannot be null");
if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype"); if ( !calledHaplotypes.contains(refHaplotype)) throw new IllegalArgumentException("calledHaplotypes must contain the refHaplotype");
if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null"); if ( paddedReferenceLoc == null ) throw new IllegalArgumentException("paddedReferenceLoc cannot be null");
if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null"); if ( activeRegion == null ) throw new IllegalArgumentException("activeRegion cannot be null");
if ( stratifiedReadMap == null ) throw new IllegalArgumentException("stratifiedReadMap cannot be null"); if ( readLikelihoods == null ) throw new IllegalArgumentException("readLikelihoods cannot be null");
if ( stratifiedReadMap.size() != 1 ) throw new IllegalArgumentException("stratifiedReadMap must contain exactly one sample but it contained " + stratifiedReadMap.size()); if ( readLikelihoods.sampleCount() != 1 ) throw new IllegalArgumentException("readLikelihoods must contain exactly one sample but it contained " + readLikelihoods.sampleCount());
final List<GATKSAMRecord> reads = activeRegion.getReads(); final List<GATKSAMRecord> reads = activeRegion.getReads();

View File

@ -47,11 +47,13 @@
package org.broadinstitute.gatk.tools.walkers.indels; package org.broadinstitute.gatk.tools.walkers.indels;
import com.google.java.contract.Ensures; import com.google.java.contract.Ensures;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.engine.contexts.ReferenceContext; import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.clipping.ReadClipper; import org.broadinstitute.gatk.utils.clipping.ReadClipper;
import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.pairhmm.ArrayLoglessPairHMM; import org.broadinstitute.gatk.utils.pairhmm.ArrayLoglessPairHMM;
import org.broadinstitute.gatk.utils.pairhmm.Log10PairHMM; import org.broadinstitute.gatk.utils.pairhmm.Log10PairHMM;
@ -61,12 +63,8 @@ import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.sam.ReadUtils; import org.broadinstitute.gatk.utils.sam.ReadUtils;
import htsjdk.variant.variantcontext.Allele;
import java.util.Arrays; import java.util.*;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.Map;
public class PairHMMIndelErrorModel { public class PairHMMIndelErrorModel {
@ -297,10 +295,8 @@ public class PairHMMIndelErrorModel {
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) { final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) {
final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()];
final LinkedList<GATKSAMRecord> readList = new LinkedList<>();
final Map<GATKSAMRecord, byte[]> readGCPArrayMap = new LinkedHashMap<>();
int readIdx=0; int readIdx=0;
for (PileupElement p: pileup) { for (final PileupElement p: pileup) {
// check if we've already computed likelihoods for this pileup element (i.e. for this read at this location) // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location)
if (perReadAlleleLikelihoodMap.containsPileupElement(p)) { if (perReadAlleleLikelihoodMap.containsPileupElement(p)) {
@ -439,28 +435,31 @@ public class PairHMMIndelErrorModel {
// Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM // Create a new read based on the current one, but with trimmed bases/quals, for use in the HMM
final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities); final GATKSAMRecord processedRead = GATKSAMRecord.createQualityModifiedRead(read, readBases, readQuals, baseInsertionQualities, baseDeletionQualities);
readList.add(processedRead);
// Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM // Pack the shortened read and its associated gap-continuation-penalty array into a map, as required by PairHMM
readGCPArrayMap.put(processedRead,contextLogGapContinuationProbabilities); final Map<GATKSAMRecord,byte[]> readGCPArrayMap = Collections.singletonMap(processedRead,contextLogGapContinuationProbabilities);
// Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the appropriate genomic locations // Create a map of alleles to a new set of haplotypes, whose bases have been trimmed to the appropriate genomic locations
final Map<Allele, Haplotype> trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref); final Map<Allele, Haplotype> trimmedHaplotypeMap = trimHaplotypes(haplotypeMap, startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, ref);
// Apparently more than one allele can map to the same haplotype after trimming
final Set<Haplotype> distinctHaplotypesSet = new LinkedHashSet<>(trimmedHaplotypeMap.values());
final List<Haplotype> distinctHaplotypesList = Arrays.asList(distinctHaplotypesSet.toArray(new Haplotype[distinctHaplotypesSet.size()]));
// Get the likelihoods for our clipped read against each of our trimmed haplotypes. // Get the likelihoods for our clipped read against each of our trimmed haplotypes.
final PerReadAlleleLikelihoodMap singleReadRawLikelihoods = pairHMM.computeLikelihoods(readList, trimmedHaplotypeMap, readGCPArrayMap); final ReadLikelihoods<Haplotype> rl = new ReadLikelihoods<>(
Collections.singletonList("DUMMY_SAMPLE"),distinctHaplotypesList,Collections.singletonMap("DUMMY_SAMPLE",Collections.singletonList(processedRead)));
final ReadLikelihoods.Matrix<Haplotype> dummySampleLikelihoods = rl.sampleMatrix(0);
pairHMM.computeLikelihoods(dummySampleLikelihoods, Collections.singletonList(processedRead), readGCPArrayMap);
// Pack the original pileup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array // Pack the original pileup element, each allele, and each associated log10 likelihood into a final map, and add each likelihood to the array
for (Allele a: trimmedHaplotypeMap.keySet()){ for (final Allele a: trimmedHaplotypeMap.keySet()){
double readLikelihood = singleReadRawLikelihoods.getLikelihoodAssociatedWithReadAndAllele(processedRead, a); final Haplotype h = trimmedHaplotypeMap.get(a);
perReadAlleleLikelihoodMap.add(p, a, readLikelihood); final int hIndex = rl.alleleIndex(h);
final double readLikelihood = dummySampleLikelihoods.get(hIndex,0);
readLikelihoods[readIdx][j++] = readLikelihood; readLikelihoods[readIdx][j++] = readLikelihood;
perReadAlleleLikelihoodMap.add(p,a,readLikelihood);
} }
// The readList for sending to the HMM should only ever contain 1 read, as each must be clipped individually
readList.remove(processedRead);
// The same is true for the read/GCP-array map
readGCPArrayMap.remove(processedRead);
} }
} }
readIdx++; readIdx++;
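Taken together, the hunk above replaces the old one-read readList/GCP-map bookkeeping with a throwaway single-sample ReadLikelihoods container. A minimal sketch of that pattern, assuming the java.util imports and the GATK types used in this diff (the helper name and its Map return type are ours):

    // Hypothetical helper: score one processed read against each allele's trimmed haplotype.
    private static Map<Allele, Double> scoreSingleRead(final PairHMM pairHMM,
                                                       final GATKSAMRecord processedRead,
                                                       final byte[] gcpArray,
                                                       final Map<Allele, Haplotype> trimmedHaplotypeMap) {
        // Keep only distinct haplotypes: several alleles can share one haplotype after trimming.
        final List<Haplotype> haplotypes = new ArrayList<>(new LinkedHashSet<>(trimmedHaplotypeMap.values()));
        // One dummy sample holding exactly one read, so the sample matrix is haplotypes x 1.
        final ReadLikelihoods<Haplotype> rl = new ReadLikelihoods<>(
                Collections.singletonList("DUMMY_SAMPLE"), haplotypes,
                Collections.singletonMap("DUMMY_SAMPLE", Collections.singletonList(processedRead)));
        pairHMM.computeLikelihoods(rl.sampleMatrix(0), Collections.singletonList(processedRead),
                Collections.singletonMap(processedRead, gcpArray));
        // Map each allele back to its haplotype's log10 likelihood for the single read (read index 0).
        final Map<Allele, Double> result = new LinkedHashMap<>();
        for (final Map.Entry<Allele, Haplotype> e : trimmedHaplotypeMap.entrySet())
            result.put(e.getKey(), rl.sampleMatrix(0).get(rl.alleleIndex(e.getValue()), 0));
        return result;
    }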

@ -47,11 +47,10 @@
package org.broadinstitute.gatk.utils.haplotype; package org.broadinstitute.gatk.utils.haplotype;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import htsjdk.variant.variantcontext.VariantContext;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine;
import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.MathUtils;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import java.util.*; import java.util.*;
@ -66,7 +65,7 @@ import java.util.*;
*/ */
public class HaplotypeLDCalculator { public class HaplotypeLDCalculator {
private final List<Haplotype> haplotypes; private final List<Haplotype> haplotypes;
private final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap; private final ReadLikelihoods<Haplotype> readLikelihoods;
private List<Map<Haplotype, Double>> haplotypeLikelihoodsPerSample = null; private List<Map<Haplotype, Double>> haplotypeLikelihoodsPerSample = null;
// linear contingency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1] // linear contingency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1]
@ -75,14 +74,15 @@ public class HaplotypeLDCalculator {
/** /**
* For testing * For testing
*/ */
@SuppressWarnings("unchecked")
protected HaplotypeLDCalculator() { protected HaplotypeLDCalculator() {
haplotypes = Collections.emptyList(); haplotypes = Collections.emptyList();
haplotypeReadMap = Collections.emptyMap(); readLikelihoods = new ReadLikelihoods<>((List<String>)Collections.EMPTY_LIST, (List<Haplotype>)Collections.EMPTY_LIST, Collections.EMPTY_MAP);
} }
public HaplotypeLDCalculator(List<Haplotype> haplotypes, Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap) { public HaplotypeLDCalculator(final List<Haplotype> haplotypes, final ReadLikelihoods<Haplotype> haplotypeReadMap) {
this.haplotypes = haplotypes; this.haplotypes = haplotypes;
this.haplotypeReadMap = haplotypeReadMap; this.readLikelihoods = haplotypeReadMap;
} }
/** /**
@ -94,13 +94,13 @@ public class HaplotypeLDCalculator {
private void buildHaplotypeLikelihoodsPerSampleIfNecessary() { private void buildHaplotypeLikelihoodsPerSampleIfNecessary() {
if ( haplotypeLikelihoodsPerSample == null ) { if ( haplotypeLikelihoodsPerSample == null ) {
// do the lazy computation // do the lazy computation
final Set<String> samples = haplotypeReadMap.keySet(); final Set<String> samples = new LinkedHashSet<>(readLikelihoods.samples());
haplotypeLikelihoodsPerSample = new LinkedList<Map<Haplotype, Double>>(); haplotypeLikelihoodsPerSample = new LinkedList<>();
for( final String sample : samples ) { for( final String sample : samples ) {
final Map<Haplotype, Double> map = new HashMap<Haplotype, Double>(haplotypes.size()); final Map<Haplotype, Double> map = new HashMap<>(haplotypes.size());
for( final Haplotype h : haplotypes ) { for( final Haplotype h : haplotypes ) {
// count up the co-occurrences of the events for the R^2 calculation // count up the co-occurrences of the events for the R^2 calculation
final double haplotypeLikelihood = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, haplotypeReadMap, Collections.singletonList(Allele.create(h, true)), false)[0][0]; final double haplotypeLikelihood = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, readLikelihoods, Collections.singletonList(h), false)[0][0];
map.put(h, haplotypeLikelihood); map.put(h, haplotypeLikelihood);
} }
haplotypeLikelihoodsPerSample.add(map); haplotypeLikelihoodsPerSample.add(map);
@ -162,7 +162,7 @@ public class HaplotypeLDCalculator {
* *
* The probability is just p11_22 / (p11_22 + p hets) * The probability is just p11_22 / (p11_22 + p hets)
* *
* @table linear contingency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1] * @param table linear contingency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1]
* doesn't have to be normalized as this function does the normalization internally * doesn't have to be normalized as this function does the normalization internally
* @return the real space probability that the data is phased * @return the real space probability that the data is phased
*/ */
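The javadoc above leaves the arithmetic implicit. One plausible reading, an assumption on our part since the method body is not shown in this hunk, treats the diagonal entries table[0] and table[3] as the phased (cis) configurations and the off-diagonal entries as the het configurations:

    // Sketch only: assumes table[0] == [0][0] and table[3] == [1][1] are the phased
    // configurations and table[1], table[2] the het configurations.
    static double phasedProbability(final double[] table) {
        final double p11_22 = table[0] + table[3]; // both events co-occur on the same haplotypes
        final double pHets  = table[1] + table[2]; // events split across haplotypes
        // The ratio normalizes itself, so the caller may pass unnormalized counts.
        return p11_22 / (p11_22 + pHets);
    }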

@ -49,12 +49,15 @@ package org.broadinstitute.gatk.utils.haplotype;
import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.ArrayUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.VariantContextBuilder;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import java.util.*; import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
/** /**
* Merges VariantContexts in a series of haplotypes according to their pairwise LD * Merges VariantContexts in a series of haplotypes according to their pairwise LD
@ -94,19 +97,19 @@ public class LDMerger extends MergeVariantsAcrossHaplotypes {
* Merge as many events among the haplotypes as possible based on pairwise LD among variants * Merge as many events among the haplotypes as possible based on pairwise LD among variants
* *
* @param haplotypes a list of haplotypes whose events we want to merge * @param haplotypes a list of haplotypes whose events we want to merge
* @param haplotypeReadMap map from sample name -> read likelihoods for each haplotype * @param readLikelihoods per-sample likelihoods of each read against each haplotype
* @param startPosKeySet a set of starting positions of all events among the haplotypes * @param startPosKeySet a set of starting positions of all events among the haplotypes
* @param ref the reference bases * @param ref the reference bases
* @param refLoc the span of the reference bases * @param refLoc the span of the reference bases
*/ */
@Override @Override
public boolean merge( final List<Haplotype> haplotypes, public boolean merge( final List<Haplotype> haplotypes,
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap, final ReadLikelihoods<Haplotype> readLikelihoods,
final TreeSet<Integer> startPosKeySet, final TreeSet<Integer> startPosKeySet,
final byte[] ref, final byte[] ref,
final GenomeLoc refLoc ) { final GenomeLoc refLoc ) {
if ( haplotypes == null ) throw new IllegalArgumentException("haplotypes cannot be null"); if ( haplotypes == null ) throw new IllegalArgumentException("haplotypes cannot be null");
if ( haplotypeReadMap == null ) throw new IllegalArgumentException("haplotypeReadMap cannot be null"); if ( readLikelihoods == null ) throw new IllegalArgumentException("readLikelihoods cannot be null");
if ( startPosKeySet == null ) throw new IllegalArgumentException("startPosKeySet cannot be null"); if ( startPosKeySet == null ) throw new IllegalArgumentException("startPosKeySet cannot be null");
if ( ref == null ) throw new IllegalArgumentException("ref cannot be null"); if ( ref == null ) throw new IllegalArgumentException("ref cannot be null");
if ( refLoc == null ) throw new IllegalArgumentException("refLoc cannot be null"); if ( refLoc == null ) throw new IllegalArgumentException("refLoc cannot be null");
@ -114,8 +117,8 @@ public class LDMerger extends MergeVariantsAcrossHaplotypes {
if( startPosKeySet.size() <= 1 ) { return false; } if( startPosKeySet.size() <= 1 ) { return false; }
final int nSamples = haplotypeReadMap.keySet().size(); final int nSamples = readLikelihoods.sampleCount();
final HaplotypeLDCalculator r2Calculator = new HaplotypeLDCalculator(haplotypes, haplotypeReadMap); final HaplotypeLDCalculator r2Calculator = new HaplotypeLDCalculator(haplotypes, readLikelihoods);
boolean somethingWasMerged = false; boolean somethingWasMerged = false;
boolean mapWasUpdated = true; boolean mapWasUpdated = true;
while( mapWasUpdated ) { while( mapWasUpdated ) {
@ -207,7 +210,7 @@ public class LDMerger extends MergeVariantsAcrossHaplotypes {
* @param haplotypes our haplotypes * @param haplotypes our haplotypes
* @param thisStart the starting position of the first event to merge * @param thisStart the starting position of the first event to merge
* @param nextStart the starting position of the next event to merge * @param nextStart the starting position of the next event to merge
* @return * @return never {@code null}.
*/ */
private LDMergeData getPairOfEventsToMerge(final List<Haplotype> haplotypes, final int thisStart, final int nextStart) { private LDMergeData getPairOfEventsToMerge(final List<Haplotype> haplotypes, final int thisStart, final int nextStart) {
final LDMergeData mergeData = new LDMergeData(); final LDMergeData mergeData = new LDMergeData();
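With the signature above, callers hand the ReadLikelihoods container straight to the merger. A hedged usage sketch (the surrounding variables are ours, and the LDMerger constructor is not part of this hunk):

    // Sketch: one round of LD-based event merging with the new container.
    final boolean somethingWasMerged = ldMerger.merge(haplotypes, readLikelihoods, startPosKeySet, ref, refLoc);
    if (somethingWasMerged) {
        // The haplotype event maps and startPosKeySet were updated in place;
        // downstream genotyping now sees the merged events.
    }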

@ -47,10 +47,9 @@
package org.broadinstitute.gatk.utils.haplotype; package org.broadinstitute.gatk.utils.haplotype;
import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.TreeSet; import java.util.TreeSet;
/** /**
@ -63,14 +62,14 @@ public class MergeVariantsAcrossHaplotypes {
* Merge variants across the haplotypes, updating the haplotype event maps and startPos set as appropriate * Merge variants across the haplotypes, updating the haplotype event maps and startPos set as appropriate
* *
* @param haplotypes a list of haplotypes whose events we want to merge * @param haplotypes a list of haplotypes whose events we want to merge
* @param haplotypeReadMap map from sample name -> read likelihoods for each haplotype * @param readLikelihoods per-sample likelihoods of each read against each haplotype
* @param startPosKeySet a set of starting positions of all events among the haplotypes * @param startPosKeySet a set of starting positions of all events among the haplotypes
* @param ref the reference bases * @param ref the reference bases
* @param refLoc the span of the reference bases * @param refLoc the span of the reference bases
* @return true if anything was merged * @return true if anything was merged
*/ */
public boolean merge( final List<Haplotype> haplotypes, public boolean merge( final List<Haplotype> haplotypes,
final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap, final ReadLikelihoods<Haplotype> readLikelihoods,
final TreeSet<Integer> startPosKeySet, final TreeSet<Integer> startPosKeySet,
final byte[] ref, final byte[] ref,
final GenomeLoc refLoc ) { final GenomeLoc refLoc ) {

@ -47,13 +47,13 @@
package org.broadinstitute.gatk.utils.haplotypeBAMWriter; package org.broadinstitute.gatk.utils.haplotypeBAMWriter;
import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import htsjdk.variant.variantcontext.Allele;
import java.util.*; import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
/** /**
* A haplotype bam writer that writes out all haplotypes as reads and then * A haplotype bam writer that writes out all haplotypes as reads and then
@ -66,6 +66,7 @@ import java.util.*;
* Time: 1:50 PM * Time: 1:50 PM
*/ */
class AllHaplotypeBAMWriter extends HaplotypeBAMWriter { class AllHaplotypeBAMWriter extends HaplotypeBAMWriter {
public AllHaplotypeBAMWriter(final ReadDestination destination) { public AllHaplotypeBAMWriter(final ReadDestination destination) {
super(destination); super(destination);
} }
@ -78,13 +79,11 @@ class AllHaplotypeBAMWriter extends HaplotypeBAMWriter {
final GenomeLoc paddedReferenceLoc, final GenomeLoc paddedReferenceLoc,
final Collection<Haplotype> bestHaplotypes, final Collection<Haplotype> bestHaplotypes,
final Set<Haplotype> calledHaplotypes, final Set<Haplotype> calledHaplotypes,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) { final ReadLikelihoods<Haplotype> readLikelihoods) {
writeHaplotypesAsReads(haplotypes, new HashSet<>(bestHaplotypes), paddedReferenceLoc); writeHaplotypesAsReads(haplotypes, new HashSet<>(bestHaplotypes), paddedReferenceLoc);
final int sampleCount = readLikelihoods.sampleCount();
for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { for (int s = 0; s < sampleCount; s++)
for( final GATKSAMRecord read : readAlleleLikelihoodMap.getLikelihoodReadMap().keySet() ) { for (final GATKSAMRecord read : readLikelihoods.sampleReads(s))
writeReadAgainstHaplotype(read); writeReadAgainstHaplotype(read);
} }
}
}
} }
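Instead of iterating the values of a sample-to-map structure, the writer now walks samples by index. A minimal sketch of that idiom, grounded in the sampleCount()/sampleReads(int) calls above (the helper name and the println stand-in are ours):

    // Sketch: visit every read held by a ReadLikelihoods container, sample by sample.
    static void forEachRead(final ReadLikelihoods<Haplotype> likelihoods) {
        final int sampleCount = likelihoods.sampleCount();
        for (int s = 0; s < sampleCount; s++)
            for (final GATKSAMRecord read : likelihoods.sampleReads(s))
                System.out.println(read.getReadName()); // stand-in for writeReadAgainstHaplotype(read)
    }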

@ -47,15 +47,11 @@
package org.broadinstitute.gatk.utils.haplotypeBAMWriter; package org.broadinstitute.gatk.utils.haplotypeBAMWriter;
import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import htsjdk.variant.variantcontext.Allele;
import java.util.Collection; import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Set; import java.util.Set;
/** /**
@ -82,16 +78,15 @@ class CalledHaplotypeBAMWriter extends HaplotypeBAMWriter {
final GenomeLoc paddedReferenceLoc, final GenomeLoc paddedReferenceLoc,
final Collection<Haplotype> bestHaplotypes, final Collection<Haplotype> bestHaplotypes,
final Set<Haplotype> calledHaplotypes, final Set<Haplotype> calledHaplotypes,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) { final ReadLikelihoods<Haplotype> readLikelihoods) {
if ( calledHaplotypes.isEmpty() ) // only write out called haplotypes if ( calledHaplotypes.isEmpty() ) // only write out called haplotypes
return; return;
writeHaplotypesAsReads(calledHaplotypes, calledHaplotypes, paddedReferenceLoc); writeHaplotypesAsReads(calledHaplotypes, calledHaplotypes, paddedReferenceLoc);
for ( final PerReadAlleleLikelihoodMap readAlleleLikelihoodMap : stratifiedReadMap.values() ) { final int sampleCount = readLikelihoods.sampleCount();
for ( final GATKSAMRecord read : readAlleleLikelihoodMap.getLikelihoodReadMap().keySet() ) { for (int s = 0; s < sampleCount; s++)
for (final GATKSAMRecord read : readLikelihoods.sampleReads(s))
writeReadAgainstHaplotype(read); writeReadAgainstHaplotype(read);
} }
}
}
} }

@ -46,22 +46,18 @@
package org.broadinstitute.gatk.utils.haplotypeBAMWriter; package org.broadinstitute.gatk.utils.haplotypeBAMWriter;
import htsjdk.samtools.Cigar;
import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMTag; import htsjdk.samtools.SAMTag;
import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter;
import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
import org.broadinstitute.gatk.utils.sam.CigarUtils;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.smithwaterman.SWPairwiseAlignment;
import java.util.Collection; import java.util.Collection;
import java.util.HashSet; import java.util.HashSet;
import java.util.Map;
import java.util.Set; import java.util.Set;
/** /**
@ -155,17 +151,17 @@ public abstract class HaplotypeBAMWriter {
* @param paddedReferenceLoc the span of the padded reference here * @param paddedReferenceLoc the span of the padded reference here
* @param bestHaplotypes a list of the best (a subset of all) haplotypes that actually went forward into genotyping * @param bestHaplotypes a list of the best (a subset of all) haplotypes that actually went forward into genotyping
* @param calledHaplotypes a list of the haplotypes that were actually called as non-reference * @param calledHaplotypes a list of the haplotypes that were actually called as non-reference
* @param stratifiedReadMap a map from sample -> likelihoods for each read for each of the best haplotypes * @param readLikelihoods per-sample likelihoods of each read against each of the best haplotypes
*/ */
public abstract void writeReadsAlignedToHaplotypes(final Collection<Haplotype> haplotypes, public abstract void writeReadsAlignedToHaplotypes(final Collection<Haplotype> haplotypes,
final GenomeLoc paddedReferenceLoc, final GenomeLoc paddedReferenceLoc,
final Collection<Haplotype> bestHaplotypes, final Collection<Haplotype> bestHaplotypes,
final Set<Haplotype> calledHaplotypes, final Set<Haplotype> calledHaplotypes,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap); final ReadLikelihoods<Haplotype> readLikelihoods);
public void writeReadsAlignedToHaplotypes(final Collection<Haplotype> haplotypes, public void writeReadsAlignedToHaplotypes(final Collection<Haplotype> haplotypes,
final GenomeLoc paddedReferenceLoc, final GenomeLoc paddedReferenceLoc,
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap) { final ReadLikelihoods<Haplotype> stratifiedReadMap) {
writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, haplotypes, new HashSet<>(haplotypes), stratifiedReadMap); writeReadsAlignedToHaplotypes(haplotypes, paddedReferenceLoc, haplotypes, new HashSet<>(haplotypes), stratifiedReadMap);
} }
@ -210,7 +206,7 @@ public abstract class HaplotypeBAMWriter {
record.setCigar(AlignmentUtils.consolidateCigar(haplotype.getCigar())); record.setCigar(AlignmentUtils.consolidateCigar(haplotype.getCigar()));
record.setMappingQuality(isAmongBestHaplotypes ? 60 : 0); record.setMappingQuality(isAmongBestHaplotypes ? 60 : 0);
record.setReadName("HC" + uniqueNameCounter++); record.setReadName("HC" + uniqueNameCounter++);
record.setAttribute(AlignmentUtils.HAPLOTYPE_TAG, haplotype.hashCode()); record.setAttribute(AlignmentUtils.HAPLOTYPE_TAG,haplotype.hashCode());
record.setReadUnmappedFlag(false); record.setReadUnmappedFlag(false);
record.setReferenceIndex(paddedRefLoc.getContigIndex()); record.setReferenceIndex(paddedRefLoc.getContigIndex());
record.setAttribute(SAMTag.RG.toString(), READ_GROUP_ID); record.setAttribute(SAMTag.RG.toString(), READ_GROUP_ID);

@ -46,10 +46,10 @@
package org.broadinstitute.gatk.utils.pairhmm; package org.broadinstitute.gatk.utils.pairhmm;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import java.io.File; import java.io.File;
import java.lang.reflect.Field; import java.lang.reflect.Field;
@ -192,47 +192,42 @@ public final class CnyPairHMM extends PairHMM implements BatchPairHMM {
* {@inheritDoc} * {@inheritDoc}
*/ */
@Override @Override
public PerReadAlleleLikelihoodMap computeLikelihoods(final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap){ public void computeLikelihoods(final ReadLikelihoods.Matrix<Haplotype> likelihoods, final List<GATKSAMRecord> processedReads, final Map<GATKSAMRecord,byte[]> gcp){
// initialize the pairHMM if necessary // initialize the pairHMM if necessary
if (! initialized) { if (! initialized) {
int readMaxLength = findMaxReadLength(reads); int readMaxLength = findMaxReadLength(processedReads);
int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap); int haplotypeMaxLength = findMaxHaplotypeLength(likelihoods.alleles());
initialize(readMaxLength, haplotypeMaxLength); initialize(readMaxLength, haplotypeMaxLength);
} }
// Pass the read bases/quals, and the haplotypes as a list into the HMM // Pass the read bases/quals, and the haplotypes as a list into the HMM
performBatchAdditions(reads, alleleHaplotypeMap, GCPArrayMap); performBatchAdditions(processedReads, likelihoods.alleles(), gcp);
// Get the log10-likelihoods for each read/haplotype and pack into the results map // Get the log10-likelihoods for each read/haplotype and pack into the read-likelihoods matrix.
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); collectLikelihoodResults(likelihoods);
collectLikelihoodResults(reads, alleleHaplotypeMap, likelihoodMap);
return likelihoodMap;
} }
private void collectLikelihoodResults(List<GATKSAMRecord> reads, Map<Allele, Haplotype> alleleHaplotypeMap, PerReadAlleleLikelihoodMap likelihoodMap) { private void collectLikelihoodResults(ReadLikelihoods.Matrix<Haplotype> likelihoods) {
for(final GATKSAMRecord read : reads){ final int numHaplotypes = likelihoods.alleleCount();
final double[] likelihoods = batchGetResult(); final int numReads = likelihoods.readCount();
int jjj = 0; for(int r = 0; r < numReads; r++) {
for (Allele allele : alleleHaplotypeMap.keySet()){ final double[] likelihoodValues = batchGetResult();
final double log10l = likelihoods[jjj];
likelihoodMap.add(read, allele, log10l); for (int h = 0; h < numHaplotypes; h++)
jjj++; likelihoods.set(h,r, likelihoodValues[h]);
}
} }
} }
private void performBatchAdditions(List<GATKSAMRecord> reads, Map<Allele, Haplotype> alleleHaplotypeMap, Map<GATKSAMRecord, byte[]> GCPArrayMap) { private void performBatchAdditions(final List<GATKSAMRecord> reads, final List<Haplotype> haplotypes, Map<GATKSAMRecord,byte[]> gcp) {
final List<Haplotype> haplotypeList = getHaplotypeList(alleleHaplotypeMap);
for(final GATKSAMRecord read : reads){ for(final GATKSAMRecord read : reads){
final byte[] readBases = read.getReadBases(); final byte[] readBases = read.getReadBases();
final byte[] readQuals = read.getBaseQualities(); final byte[] readQuals = read.getBaseQualities();
final byte[] readInsQuals = read.getBaseInsertionQualities(); final byte[] readInsQuals = read.getBaseInsertionQualities();
final byte[] readDelQuals = read.getBaseDeletionQualities(); final byte[] readDelQuals = read.getBaseDeletionQualities();
final byte[] overallGCP = GCPArrayMap.get(read); final byte[] overallGCP = gcp.get(read);
batchAdd(haplotypeList, readBases, readQuals, readInsQuals, readDelQuals, overallGCP); batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP);
} }
} }
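The contract every PairHMM implementation now satisfies is to fill the per-sample (allele x read) matrix in place rather than return a fresh map. A schematic sketch of that shape, assuming matrix read order matches the processedReads list (as collectLikelihoodResults above does) and with log10LikelihoodOf as a hypothetical stand-in for the concrete HMM kernel:

    // Sketch: the generic shape of the new in-place likelihood fill.
    static void fillMatrix(final ReadLikelihoods.Matrix<Haplotype> likelihoods,
                           final List<GATKSAMRecord> processedReads) {
        final int alleleCount = likelihoods.alleleCount();
        for (int r = 0; r < likelihoods.readCount(); r++)
            for (int a = 0; a < alleleCount; a++)
                likelihoods.set(a, r, log10LikelihoodOf(likelihoods.alleles().get(a), processedReads.get(r)));
    }

    // Placeholder for whatever the concrete implementation computes per read/haplotype pair.
    static double log10LikelihoodOf(final Haplotype haplotype, final GATKSAMRecord read) {
        return 0.0;
    }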

@ -49,23 +49,21 @@ package org.broadinstitute.gatk.utils.pairhmm;
import com.google.java.contract.Ensures; import com.google.java.contract.Ensures;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.utils.exceptions.UserException;
import static org.broadinstitute.gatk.utils.pairhmm.PairHMMModel.*;
import java.util.List; import java.io.BufferedWriter;
import java.util.Map;
import java.util.HashMap;
import java.io.File; import java.io.File;
import java.io.FileWriter; import java.io.FileWriter;
import java.io.BufferedWriter;
import java.util.Map;
import java.util.HashMap;
import java.io.IOException; import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.broadinstitute.gatk.utils.pairhmm.PairHMMModel.*;
/** /**
@ -156,10 +154,10 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM {
* {@inheritDoc} * {@inheritDoc}
*/ */
@Override @Override
public PerReadAlleleLikelihoodMap computeLikelihoods( final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap ) { public void computeLikelihoods( final ReadLikelihoods.Matrix<Haplotype> likelihoods, final List<GATKSAMRecord> processedReads, final Map<GATKSAMRecord, byte[]> GCPArrayMap ) {
// (re)initialize the pairHMM only if necessary // (re)initialize the pairHMM only if necessary
final int readMaxLength = verify ? findMaxReadLength(reads) : 0; final int readMaxLength = verify ? findMaxReadLength(processedReads) : 0;
final int haplotypeMaxLength = verify ? findMaxHaplotypeLength(alleleHaplotypeMap) : 0; final int haplotypeMaxLength = verify ? findMaxHaplotypeLength(likelihoods.alleles()) : 0;
if(verify) if(verify)
{ {
if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength)
@ -167,15 +165,15 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM {
if ( ! initialized ) if ( ! initialized )
throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods in debug/verify mode"); throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods in debug/verify mode");
} }
int readListSize = reads.size(); int readListSize = processedReads.size();
int numHaplotypes = alleleHaplotypeMap.size(); int numHaplotypes = likelihoods.alleleCount();
int numTestcases = readListSize*numHaplotypes; int numTestcases = readListSize*numHaplotypes;
if(debug0_1) if(debug0_1)
System.out.println("Java numReads "+readListSize+" numHaplotypes "+numHaplotypes); System.out.println("Java numReads "+readListSize+" numHaplotypes "+numHaplotypes);
int idx = 0; int idx = 0;
for(GATKSAMRecord read : reads) for(final GATKSAMRecord read : processedReads)
{ {
byte [] overallGCP = GCPArrayMap.get(read); final byte [] overallGCP = GCPArrayMap.get(read);
if(debug0_1) if(debug0_1)
System.out.println("Java read length "+read.getReadBases().length); System.out.println("Java read length "+read.getReadBases().length);
if(debug3) if(debug3)
@ -195,9 +193,9 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM {
if(verify) if(verify)
{ {
idx = 0; idx = 0;
for (Map.Entry<Allele, Haplotype> currEntry : alleleHaplotypeMap.entrySet()) //order is important - access in same order always for (final Haplotype h : likelihoods.alleles()) //order is important - access in same order always
{ {
byte[] haplotypeBases = currEntry.getValue().getBases(); byte[] haplotypeBases = h.getBases();
if(debug0_1) if(debug0_1)
System.out.println("Java haplotype length "+haplotypeBases.length); System.out.println("Java haplotype length "+haplotypeBases.length);
if(debug3) if(debug3)
@ -212,10 +210,10 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM {
PerReadAlleleLikelihoodMap likelihoodMap = null; PerReadAlleleLikelihoodMap likelihoodMap = null;
if(verify) if(verify)
{ {
jniPairHMM.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap); jniPairHMM.computeLikelihoods(likelihoods, processedReads, GCPArrayMap);
likelihoodArray = jniPairHMM.getLikelihoodArray(); likelihoodArray = jniPairHMM.getLikelihoodArray();
//to compare values //to compare values
likelihoodMap = super.computeLikelihoods(reads, alleleHaplotypeMap, GCPArrayMap); super.computeLikelihoods(likelihoods, processedReads, GCPArrayMap);
} }
else else
{ {
@ -234,13 +232,13 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM {
idx = 0; idx = 0;
int idxInsideHaplotypeList = 0; int idxInsideHaplotypeList = 0;
int readIdx = 0; int readIdx = 0;
for(GATKSAMRecord read : reads) for(final GATKSAMRecord read : processedReads)
{ {
for(int j=0;j<numHaplotypes;++j) for(int j=0;j<numHaplotypes;++j)
tmpArray[j] = likelihoodArray[readIdx+j]; tmpArray[j] = likelihoodArray[readIdx+j];
for (Map.Entry<Allele, Haplotype> currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always for (final Haplotype haplotype : likelihoods.alleles())//order is important - access in same order always
{ {
idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(currEntry.getValue()); idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(haplotype);
likelihoodArray[idx] = tmpArray[idxInsideHaplotypeList]; likelihoodArray[idx] = tmpArray[idxInsideHaplotypeList];
++idx; ++idx;
} }
@ -271,13 +269,13 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM {
idx = 0; idx = 0;
System.out.println("Dump : Java numReads "+readListSize+" numHaplotypes "+numHaplotypes); System.out.println("Dump : Java numReads "+readListSize+" numHaplotypes "+numHaplotypes);
boolean firstLine = true; boolean firstLine = true;
for(GATKSAMRecord read : reads) for(final GATKSAMRecord read : processedReads)
{ {
byte [] overallGCP = GCPArrayMap.get(read); byte [] overallGCP = GCPArrayMap.get(read);
byte[] tmpByteArray = new byte[read.getReadBases().length]; byte[] tmpByteArray = new byte[read.getReadBases().length];
for (Map.Entry<Allele, Haplotype> currEntry : alleleHaplotypeMap.entrySet()) //order is important - access in same order always for (final Haplotype haplotype : likelihoods.alleles()) //order is important - access in same order always
{ {
byte[] haplotypeBases = currEntry.getValue().getBases(); byte[] haplotypeBases = haplotype.getBases();
debugDump("debug_dump.txt",new String(haplotypeBases)+" ",true); debugDump("debug_dump.txt",new String(haplotypeBases)+" ",true);
debugDump("debug_dump.txt",new String(read.getReadBases())+" ",true); debugDump("debug_dump.txt",new String(read.getReadBases())+" ",true);
for(int k=0;k<read.getReadBases().length;++k) for(int k=0;k<read.getReadBases().length;++k)
@ -303,7 +301,7 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM {
debugDump("debug_results.txt",String.format("%e %e\n",mLikelihoodArray[idx],likelihoodArray[idx]),true); debugDump("debug_results.txt",String.format("%e %e\n",mLikelihoodArray[idx],likelihoodArray[idx]),true);
else else
if(dumpSandboxOnly) if(dumpSandboxOnly)
likelihoodMap.add(read, currEntry.getKey(), likelihoodArray[idx]); likelihoods.set(likelihoods.alleleIndex(haplotype),likelihoods.readIndex(read), likelihoodArray[idx]);
++idx; ++idx;
} }
} }
@ -314,7 +312,6 @@ public class DebugJNILoglessPairHMM extends LoglessPairHMM {
//if(numComputeLikelihoodCalls == 5) //if(numComputeLikelihoodCalls == 5)
//jniPairHMM.close(); //jniPairHMM.close();
//System.exit(0); //System.exit(0);
return likelihoodMap;
} }
//Used to test parts of the compute kernel separately //Used to test parts of the compute kernel separately
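In verify mode the debug HMM fills the matrix twice, once through JNI and once via the pure-Java super.computeLikelihoods, which makes an element-wise comparison possible. A sketch of such a check, assuming the JNI array has already been reordered to matrix order as in the loop above (the helper name and tolerance are our choices):

    // Sketch: compare reordered JNI results against the Java-filled matrix.
    static void verifyAgainstMatrix(final double[] jniLikelihoods,
                                    final ReadLikelihoods.Matrix<Haplotype> javaLikelihoods) {
        final int numHaplotypes = javaLikelihoods.alleleCount();
        for (int r = 0; r < javaLikelihoods.readCount(); r++)
            for (int h = 0; h < numHaplotypes; h++) {
                final double jniValue = jniLikelihoods[r * numHaplotypes + h];
                final double javaValue = javaLikelihoods.get(h, r);
                if (Math.abs(jniValue - javaValue) > 1e-5) // illustrative tolerance
                    System.err.printf("Mismatch at read=%d hap=%d: %e vs %e%n", r, h, jniValue, javaValue);
            }
    }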

@ -46,26 +46,16 @@
package org.broadinstitute.gatk.utils.pairhmm; package org.broadinstitute.gatk.utils.pairhmm;
import com.google.java.contract.Ensures; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import com.google.java.contract.Requires;
import org.broadinstitute.gatk.utils.QualityUtils;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import htsjdk.variant.variantcontext.Allele;
import java.io.*;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.HashMap;
//For loading library from jar //For loading library from jar
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
/** /**
* Created with IntelliJ IDEA. * Created with IntelliJ IDEA.
@ -154,7 +144,7 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {
private native void jniInitializeHaplotypes(final int numHaplotypes, JNIHaplotypeDataHolderClass[] haplotypeDataArray); private native void jniInitializeHaplotypes(final int numHaplotypes, JNIHaplotypeDataHolderClass[] haplotypeDataArray);
//Hold the mapping between haplotype and index in the list of Haplotypes passed to initialize //Hold the mapping between haplotype and index in the list of Haplotypes passed to initialize
//Use this mapping in computeLikelihoods to find the likelihood value corresponding to a given Haplotype //Use this mapping in computeLikelihoods to find the likelihood value corresponding to a given Haplotype
private HashMap<Haplotype,Integer> haplotypeToHaplotypeListIdxMap = new HashMap<Haplotype,Integer>(); private HashMap<Haplotype,Integer> haplotypeToHaplotypeListIdxMap = new HashMap<>();
private JNIHaplotypeDataHolderClass[] mHaplotypeDataArray; private JNIHaplotypeDataHolderClass[] mHaplotypeDataArray;
@Override @Override
public HashMap<Haplotype, Integer> getHaplotypeToHaplotypeListIdxMap() { return haplotypeToHaplotypeListIdxMap; } public HashMap<Haplotype, Integer> getHaplotypeToHaplotypeListIdxMap() { return haplotypeToHaplotypeListIdxMap; }
@ -204,22 +194,23 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {
* {@inheritDoc} * {@inheritDoc}
*/ */
@Override @Override
public PerReadAlleleLikelihoodMap computeLikelihoods( final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap ) { public void computeLikelihoods( final ReadLikelihoods.Matrix<Haplotype> likelihoods, final List<GATKSAMRecord> processedReads, final Map<GATKSAMRecord,byte[]> gcp ) {
if (processedReads.isEmpty())
return;
if(doProfiling) if(doProfiling)
startTime = System.nanoTime(); startTime = System.nanoTime();
int readListSize = reads.size(); int readListSize = processedReads.size();
int numHaplotypes = alleleHaplotypeMap.size(); int numHaplotypes = likelihoods.alleleCount();
int numTestcases = readListSize*numHaplotypes;
JNIReadDataHolderClass[] readDataArray = new JNIReadDataHolderClass[readListSize]; JNIReadDataHolderClass[] readDataArray = new JNIReadDataHolderClass[readListSize];
int idx = 0; int idx = 0;
for(GATKSAMRecord read : reads) for(GATKSAMRecord read : processedReads)
{ {
readDataArray[idx] = new JNIReadDataHolderClass(); readDataArray[idx] = new JNIReadDataHolderClass();
readDataArray[idx].readBases = read.getReadBases(); readDataArray[idx].readBases = read.getReadBases();
readDataArray[idx].readQuals = read.getBaseQualities(); readDataArray[idx].readQuals = read.getBaseQualities();
readDataArray[idx].insertionGOP = read.getBaseInsertionQualities(); readDataArray[idx].insertionGOP = read.getBaseInsertionQualities();
readDataArray[idx].deletionGOP = read.getBaseDeletionQualities(); readDataArray[idx].deletionGOP = read.getBaseDeletionQualities();
readDataArray[idx].overallGCP = GCPArrayMap.get(read); readDataArray[idx].overallGCP = gcp.get(read);
++idx; ++idx;
} }
@ -231,19 +222,17 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {
// compute_full_prob() // compute_full_prob()
jniComputeLikelihoods(readListSize, numHaplotypes, readDataArray, mHaplotypeDataArray, mLikelihoodArray, 12); jniComputeLikelihoods(readListSize, numHaplotypes, readDataArray, mHaplotypeDataArray, mLikelihoodArray, 12);
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
idx = 0;
int idxInsideHaplotypeList = 0;
int readIdx = 0; int readIdx = 0;
for(GATKSAMRecord read : reads) for(int r = 0; r < readListSize; r++)
{
for (Map.Entry<Allele, Haplotype> currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always
{ {
int hapIdx = 0;
for (final Haplotype haplotype : likelihoods.alleles()) {
//Since the order of haplotypes in the List<Haplotype> and alleleHaplotypeMap is different, //Since the order of haplotypes in the List<Haplotype> and alleleHaplotypeMap is different,
//get idx of current haplotype in the list and use this idx to get the right likelihoodValue //get idx of current haplotype in the list and use this idx to get the right likelihoodValue
idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(currEntry.getValue()); final int idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(haplotype);
likelihoodMap.add(read, currEntry.getKey(), mLikelihoodArray[readIdx + idxInsideHaplotypeList]); likelihoods.set(hapIdx,r,mLikelihoodArray[readIdx + idxInsideHaplotypeList]);
++idx; ++hapIdx;
} }
readIdx += numHaplotypes; readIdx += numHaplotypes;
} }
@ -256,7 +245,6 @@ public class VectorLoglessPairHMM extends JNILoglessPairHMM {
pairHMMSetupTime += threadLocalSetupTimeDiff; pairHMMSetupTime += threadLocalSetupTimeDiff;
} }
} }
return likelihoodMap;
} }
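The reorder step above exists because the JNI kernel reports likelihoods in the haplotype order handed to jniInitializeHaplotypes, which need not match likelihoods.alleles(). haplotypeToHaplotypeListIdxMap bridges the two orders; presumably it is populated once per initialize() call, roughly as below (that code is outside this hunk, so treat it as an assumption):

    // Sketch: record each haplotype's slot in the order passed to the JNI layer,
    // so computeLikelihoods can later find the right entry in mLikelihoodArray.
    haplotypeToHaplotypeListIdxMap.clear();
    int idx = 0;
    for (final Haplotype h : haplotypeList) // same order as jniInitializeHaplotypes sees
        haplotypeToHaplotypeListIdxMap.put(h, idx++);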
/** /**

@ -53,13 +53,14 @@ import java.util.Arrays;
public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; private final static String baseCommand = "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R "
+ b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
// //
// testing normal calling // testing normal calling
// //
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
@Test @Test
public void testMultiSamplePilot1() { public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(

@ -111,8 +111,8 @@ public class HCLikelihoodCalculationEnginesBenchmark extends SimpleBenchmark {
@SuppressWarnings("unused") @SuppressWarnings("unused")
public void timeGraphBasedLikelihoods(final int reps) { public void timeGraphBasedLikelihoods(final int reps) {
for (int i = 0; i < reps; i++) { for (int i = 0; i < reps; i++) {
GraphBasedLikelihoodCalculationEngineInstance rtlce = new GraphBasedLikelihoodCalculationEngineInstance(dataSet.assemblyResultSet(), new FastLoglessPairHMM((byte)10),Double.NEGATIVE_INFINITY,HeterogeneousKmerSizeResolution.COMBO_MAX); final GraphBasedLikelihoodCalculationEngineInstance rtlce = new GraphBasedLikelihoodCalculationEngineInstance(dataSet.assemblyResultSet(), new FastLoglessPairHMM((byte)10),Double.NEGATIVE_INFINITY,HeterogeneousKmerSizeResolution.COMBO_MAX);
rtlce.computeReadLikelihoods(dataSet.haplotypeList(), Collections.singletonMap("anonymous", dataSet.readList())); rtlce.computeReadLikelihoods(dataSet.haplotypeList(), Collections.singletonList("anonymous"), Collections.singletonMap("anonymous", dataSet.readList()));
} }
} }
@ -121,7 +121,7 @@ public class HCLikelihoodCalculationEnginesBenchmark extends SimpleBenchmark {
for (int i = 0; i < reps; i++) { for (int i = 0; i < reps; i++) {
final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte) 10, final PairHMMLikelihoodCalculationEngine engine = new PairHMMLikelihoodCalculationEngine((byte) 10,
PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3, true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.NONE); PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3, true, PairHMMLikelihoodCalculationEngine.PCR_ERROR_MODEL.NONE);
engine.computeReadLikelihoods(dataSet.assemblyResultSet(), Collections.singletonMap("anonymous", dataSet.readList())); engine.computeReadLikelihoods(dataSet.assemblyResultSet(), Collections.singletonList("anonymous"), Collections.singletonMap("anonymous", dataSet.readList()));
} }
} }
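Both engines now take an explicit sample list next to the per-sample read map. A hedged sketch of the shared calling convention; the return type is presumably the new ReadLikelihoods<Haplotype> container, as the unit-test usage later in this diff suggests:

    // Sketch: the new three-argument calling convention for the likelihood engines.
    final List<String> samples = Collections.singletonList("anonymous");
    final Map<String, List<GATKSAMRecord>> readsBySample =
            Collections.singletonMap("anonymous", dataSet.readList());
    final ReadLikelihoods<Haplotype> result =
            engine.computeReadLikelihoods(dataSet.assemblyResultSet(), samples, readsBySample);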

@ -49,13 +49,13 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.reference.IndexedFastaSequenceFile;
import htsjdk.tribble.readers.LineIterator; import htsjdk.tribble.readers.LineIterator;
import htsjdk.tribble.readers.PositionalBufferedStream; import htsjdk.tribble.readers.PositionalBufferedStream;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFCodec;
import org.broadinstitute.gatk.engine.walkers.WalkerTest; import org.broadinstitute.gatk.engine.walkers.WalkerTest;
import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; import org.broadinstitute.gatk.utils.variant.GATKVCFUtils;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFCodec;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.Test; import org.testng.annotations.Test;
@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testHaplotypeCallerMultiSample() { public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "60e2f0c3ce33a05c060035d86bc79543"); HCTest(CEUTRIO_BAM, "", "321a0b7cb6ff80691f23566670e1d604");
} }
@Test @Test
@ -99,12 +99,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testHaplotypeCallerGraphBasedSingleSample() { public void testHaplotypeCallerGraphBasedSingleSample() {
HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "07910f50710349eacd2560452fac3e8d"); HCTest(NA12878_BAM, "-likelihoodEngine GraphBased", "6cf15ddbfa4a3738e891fd9a09da8d07");
} }
@Test @Test
public void testHaplotypeCallerGraphBasedMultiSample() { public void testHaplotypeCallerGraphBasedMultiSample() {
HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "c5ef449a46b80b69dde87aa52041fe50"); HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased", "4c2a2dad6379b13fee4c7faca17441f5");
} }
@Test(enabled = false) // can't annotate the rsID's yet @Test(enabled = false) // can't annotate the rsID's yet
@ -115,7 +115,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testHaplotypeCallerMultiSampleGGA() { public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf",
"b61e0bdf0e3180cb4f5abd3491b05aa6"); "4047ea70f37b59b782402f678ea2dfee");
} }
@Test @Test
@ -244,7 +244,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
public void HCTestDBSNPAnnotationWGSGraphBased() { public void HCTestDBSNPAnnotationWGSGraphBased() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("fc2e81d02c6bbf15147a46797402fda7")); Arrays.asList("7748a90ffd7daa15a8d518fc528bfcc5"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
} }
@ -253,7 +253,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 "-T HaplotypeCaller -likelihoodEngine GraphBased --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1, + " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("83bf354f0238a362d4b6ef8d14679999")); Arrays.asList("6bea6d26a3512a7b74d9610d188b6e9a"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
} }

@ -262,7 +262,7 @@ public class ReadThreadingLikelihoodCalculationEngineUnitTest extends ActiveRegi
dataSet = (ActiveRegionTestDataSet) params[0]; dataSet = (ActiveRegionTestDataSet) params[0];
if (INTRODUCE_READ_ERRORS) dataSet.introduceErrors(new Random(13)); if (INTRODUCE_READ_ERRORS) dataSet.introduceErrors(new Random(13));
graphEngine = new GraphBasedLikelihoodCalculationEngineInstance(dataSet.assemblyResultSet(),hmm,Double.NEGATIVE_INFINITY, HeterogeneousKmerSizeResolution.COMBO_MAX); graphEngine = new GraphBasedLikelihoodCalculationEngineInstance(dataSet.assemblyResultSet(),hmm,Double.NEGATIVE_INFINITY, HeterogeneousKmerSizeResolution.COMBO_MAX);
graphLks = graphEngine.computeReadLikelihoods(dataSet.haplotypeList(),Collections.singletonMap("anonymous",dataSet.readList())).get("anonymous"); graphLks = graphEngine.computeReadLikelihoods(dataSet.haplotypeList(),Collections.singletonList("anonymous"),Collections.singletonMap("anonymous",dataSet.readList())).toPerReadAlleleLikelihoodMap(0);
// clip reads at the anchors. // clip reads at the anchors.
final Map<GATKSAMRecord,GATKSAMRecord> clippedReads = anchorClippedReads(graphEngine.getHaplotypeGraph(),dataSet.readList()); final Map<GATKSAMRecord,GATKSAMRecord> clippedReads = anchorClippedReads(graphEngine.getHaplotypeGraph(),dataSet.readList());
@ -272,7 +272,7 @@ public class ReadThreadingLikelihoodCalculationEngineUnitTest extends ActiveRegi
clippedReadList.add(clippedReads.containsKey(r) ? clippedReads.get(r) : r); clippedReadList.add(clippedReads.containsKey(r) ? clippedReads.get(r) : r);
} }
loglessLks = fullPairHMM.computeReadLikelihoods(dataSet.assemblyResultSet(),Collections.singletonMap("anonymous",clippedReadList)).get("anonymous"); loglessLks = fullPairHMM.computeReadLikelihoods(dataSet.assemblyResultSet(),Collections.singletonList("anonymous"),Collections.singletonMap("anonymous",clippedReadList)).toPerReadAlleleLikelihoodMap(0);
// Replace clipped reads with their unclipped originals in the resulting likelihood map. // Replace clipped reads with their unclipped originals in the resulting likelihood map.
for (final GATKSAMRecord r : clippedReads.keySet()) { for (final GATKSAMRecord r : clippedReads.keySet()) {
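Tests that still assert against the legacy structure convert one sample at a time through toPerReadAlleleLikelihoodMap. A short sketch (variable names are ours):

    // Sketch: bridge the new container back to the legacy per-read map for sample index 0.
    final ReadLikelihoods<Haplotype> likelihoods = engine.computeReadLikelihoods(
            assemblyResultSet, Collections.singletonList("anonymous"), readsBySample);
    final PerReadAlleleLikelihoodMap legacy = likelihoods.toPerReadAlleleLikelihoodMap(0);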

@ -48,9 +48,12 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller;
import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileHeader;
import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.UnvalidatingGenomeLoc;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl;
@ -285,7 +288,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest {
data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); data.getActiveRegion().add(data.makeRead(0, data.getRefLength()));
} }
final Map<String, PerReadAlleleLikelihoodMap> likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); final ReadLikelihoods<Haplotype> likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion());
final List<Integer> expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); final List<Integer> expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads);
final List<VariantContext> contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); final List<VariantContext> contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls);
@ -302,7 +305,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest {
final List<VariantContext> calls = Collections.emptyList(); final List<VariantContext> calls = Collections.emptyList();
data.getActiveRegion().add(data.makeRead(start, readLen)); data.getActiveRegion().add(data.makeRead(start, readLen));
final Map<String, PerReadAlleleLikelihoodMap> likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); final ReadLikelihoods<Haplotype> likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion());
final List<Integer> expectedDPs = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), 0)); final List<Integer> expectedDPs = new ArrayList<>(Collections.nCopies(data.getActiveRegion().getLocation().size(), 0));
for ( int i = start; i < readLen + start; i++ ) expectedDPs.set(i, 1); for ( int i = start; i < readLen + start; i++ ) expectedDPs.set(i, 1);
@ -337,7 +340,7 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest {
data.getActiveRegion().add(data.makeRead(0, data.getRefLength())); data.getActiveRegion().add(data.makeRead(0, data.getRefLength()));
} }
final Map<String, PerReadAlleleLikelihoodMap> likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion()); final ReadLikelihoods<Haplotype> likelihoods = HaplotypeCaller.createDummyStratifiedReadMap(data.getRefHap(), new ArrayList<>(samples), data.getActiveRegion());
final List<Integer> expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads); final List<Integer> expectedDPs = Collections.nCopies(data.getActiveRegion().getLocation().size(), nReads);
final List<VariantContext> contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls); final List<VariantContext> contexts = model.calculateRefConfidence(data.getRefHap(), haplotypes, data.getPaddedRefLoc(), data.getActiveRegion(), likelihoods, calls);
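createDummyStratifiedReadMap now hands the reference-confidence model a ReadLikelihoods<Haplotype> rather than a map of maps. Its body is not part of this diff, but given the constructor used elsewhere in this commit, the dummy container plausibly amounts to the following (an assumption on our part):

    // Sketch (assumption): a single allele, the reference haplotype, with every
    // active-region read assigned to its sample; enough for the depth assertions above.
    final ReadLikelihoods<Haplotype> dummy = new ReadLikelihoods<>(
            samples, Collections.singletonList(refHaplotype), readsBySample);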

@ -0,0 +1,609 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.gatk.utils.genotyper;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.variant.variantcontext.Allele;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
import org.broadinstitute.gatk.utils.GenomeLoc;
import org.broadinstitute.gatk.utils.GenomeLocParser;
import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
/**
* Test code for {@link ReadLikelihoods}
*
* @author Valentin Ruano-Rubio &lt;valentin@broadinstitute.org&gt;
*/
public class ReadLikelihoodsUnitTest {
private static final double EPSILON = 1e-6;
private static final int ODD_READ_START = 101;
private static final int EVEN_READ_START = 1;
@Test(dataProvider = "dataSets")
public void testInstantiationAndQuery(final String[] samples, final Allele[] alleles, final Map<String,List<GATKSAMRecord>> reads) {
final ReadLikelihoods<Allele> result = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
Assert.assertEquals(result.sampleCount(), samples.length);
Assert.assertEquals(result.alleleCount(), alleles.length);
testSampleQueries(samples, reads, result);
testAlleleQueries(alleles, result);
testLikelihoodMatrixQueries(samples, result, null);
}
@Test(dataProvider = "dataSets")
public void testLikelihoodFillingAndQuery(final String[] samples, final Allele[] alleles, final Map<String,List<GATKSAMRecord>> reads) {
final ReadLikelihoods<Allele> result = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
final double[][][] likelihoods = fillWithRandomLikelihoods(samples, alleles, result);
testLikelihoodMatrixQueries(samples, result, likelihoods);
}
private double[][][] fillWithRandomLikelihoods(final String[] samples, final Allele[] alleles, final ReadLikelihoods<Allele> result) {
final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
final double[][][] likelihoods = new double[samples.length][alleles.length][];
for (int s = 0; s < likelihoods.length; s++) {
final ReadLikelihoods.Matrix<Allele> sampleLikelihoods = result.sampleMatrix(s);
for (int a = 0; a < likelihoods[s].length; a++) {
likelihoods[s][a] = new double[result.sampleReadCount(s)];
for (int r = 0; r < likelihoods[s][a].length; r++)
sampleLikelihoods.set(a,r,likelihoods[s][a][r] = -Math.abs(rnd.nextGaussian()));
}
}
return likelihoods;
}
@Test(dataProvider = "dataSets")
public void testBestAlleles(final String[] samples, final Allele[] alleles, final Map<String,List<GATKSAMRecord>> reads) {
final ReadLikelihoods<Allele> original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
fillWithRandomLikelihoods(samples,alleles,original);
final int alleleCount = alleles.length;
for (int s = 0; s < samples.length; s++) {
final int sampleReadCount = original.sampleReadCount(s);
final ReadLikelihoods.Matrix<Allele> sampleMatrix = original.sampleMatrix(s);
final double[] bestLkArray = new double[sampleReadCount];
final int[] bestIndexArray = new int[sampleReadCount];
final double[] confidenceArray = new double[sampleReadCount];
for (int r = 0; r < sampleReadCount; r++) {
int bestAlleleIndex = -1;
double bestAlleleLk = Double.NEGATIVE_INFINITY;
double secondBestAlleleLk = Double.NEGATIVE_INFINITY;
for (int a = 0; a < alleleCount; a++) {
final double lk = sampleMatrix.get(a,r);
if (lk > bestAlleleLk) {
secondBestAlleleLk = bestAlleleLk;
bestAlleleLk = lk;
bestAlleleIndex = a;
} else if (lk > secondBestAlleleLk) {
secondBestAlleleLk = lk;
}
}
bestLkArray[r] = bestAlleleLk;
confidenceArray[r] = bestAlleleLk - secondBestAlleleLk;
bestIndexArray[r] = bestAlleleIndex;
}
final Collection<ReadLikelihoods<Allele>.BestAllele> bestAlleles = original.bestAlleles();
for (final ReadLikelihoods<Allele>.BestAllele bestAllele : bestAlleles) {
final int readIndex = original.readIndex(s,bestAllele.read);
if (readIndex == -1) continue;
Assert.assertEquals(bestLkArray[readIndex],bestAllele.likelihood);
Assert.assertEquals(bestAllele.allele,alleles[bestIndexArray[readIndex]]);
Assert.assertEquals(bestAllele.confidence,confidenceArray[readIndex],EPSILON);
}
}
}
@Test(dataProvider = "dataSets")
public void testBestAlleleMap(final String[] samples, final Allele[] alleles, final Map<String,List<GATKSAMRecord>> reads) {
final ReadLikelihoods<Allele> original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
fillWithRandomLikelihoods(samples,alleles,original);
final Map<Allele,List<GATKSAMRecord>> expected = new HashMap<>(alleles.length);
for (final Allele allele : alleles)
expected.put(allele,new ArrayList<GATKSAMRecord>());
final int alleleCount = alleles.length;
for (int s = 0; s < samples.length; s++) {
final int sampleReadCount = original.sampleReadCount(s);
final ReadLikelihoods.Matrix<Allele> sampleMatrix = original.sampleMatrix(s);
for (int r = 0; r < sampleReadCount; r++) {
int bestAlleleIndex = -1;
double bestAlleleLk = Double.NEGATIVE_INFINITY;
double secondBestAlleleLk = Double.NEGATIVE_INFINITY;
for (int a = 0; a < alleleCount; a++) {
final double lk = sampleMatrix.get(a,r);
if (lk > bestAlleleLk) {
secondBestAlleleLk = bestAlleleLk;
bestAlleleLk = lk;
bestAlleleIndex = a;
} else if (lk > secondBestAlleleLk) {
secondBestAlleleLk = lk;
}
}
if ((bestAlleleLk - secondBestAlleleLk) > ReadLikelihoods.BestAllele.INFORMATIVE_THRESHOLD)
expected.get(alleles[bestAlleleIndex]).add(sampleMatrix.read(r));
}
}
final Map<Allele,List<GATKSAMRecord>> actual = original.readsByBestAlleleMap();
Assert.assertEquals(actual.size(),alleles.length);
for (final Allele allele : alleles) {
final List<GATKSAMRecord> expectedList = expected.get(allele);
final List<GATKSAMRecord> actualList = actual.get(allele);
final Set<GATKSAMRecord> expectedSet = new HashSet<>(expectedList);
final Set<GATKSAMRecord> actualSet = new HashSet<>(actualList);
Assert.assertEquals(actualSet,expectedSet);
}
}
@Test(dataProvider = "dataSets")
public void testFilterPoorlyModeledReads(final String[] samples, final Allele[] alleles, final Map<String,List<GATKSAMRecord>> reads) {
final ReadLikelihoods<Allele> original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
for (int s = 0; s < samples.length; s++) {
final int sampleReadCount = original.sampleReadCount(s);
for (int r = 0; r < sampleReadCount; r++) {
if ((r & 1) == 0) continue;
for (int a = 0; a < alleles.length; a++)
original.sampleMatrix(s).set(a,r,-10000);
}
}
final ReadLikelihoods<Allele> result = original.clone();
result.filterPoorlyModeledReads(2.0);
for (int s = 0; s < samples.length; s++) {
final int oldSampleReadCount = original.sampleReadCount(s);
final int newSampleReadCount = result.sampleReadCount(s);
Assert.assertEquals(newSampleReadCount,(oldSampleReadCount + 1) / 2);
final ReadLikelihoods.Matrix<Allele> newSampleMatrix = result.sampleMatrix(s);
final ReadLikelihoods.Matrix<Allele> oldSampleMatrix = original.sampleMatrix(s);
for (int r = 0 ; r < newSampleReadCount; r++) {
Assert.assertEquals(original.readIndex(s, result.sampleReads(s).get(r)), r * 2);
for (int a = 0; a < alleles.length; a++) {
Assert.assertEquals(newSampleMatrix.get(a,r),oldSampleMatrix.get(a,r*2));
}
}
}
}
@Test(dataProvider = "dataSets")
public void testFilterReadsToOverlap(final String[] samples, final Allele[] alleles, final Map<String,List<GATKSAMRecord>> reads) {
final ReadLikelihoods<Allele> original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(), EVEN_READ_START, EVEN_READ_START);
fillWithRandomLikelihoods(samples,alleles,original);
final ReadLikelihoods<Allele> result = original.clone();
result.filterToOnlyOverlappingUnclippedReads(evenReadOverlap);
final double[][][] newLikelihoods = new double[samples.length][alleles.length][];
for (int s = 0; s < samples.length ; s++)
for (int a = 0; a < alleles.length; a++) {
newLikelihoods[s][a] = new double[(original.sampleReadCount(s) + 1) / 2];
final ReadLikelihoods.Matrix<Allele> sampleMatrix = original.sampleMatrix(s);
for (int r = 0; r < newLikelihoods[s][a].length; r++) {
Assert.assertEquals(result.readIndex(s,sampleMatrix.read(r << 1)),r);
newLikelihoods[s][a][r] = sampleMatrix.get(a, r << 1);
}
}
testLikelihoodMatrixQueries(samples,result,newLikelihoods);
}
@Test(dataProvider = "marginalizationDataSets")
public void testMarginalizationWithOverlap(final String[] samples, final Allele[] alleles, final Map<String,List<GATKSAMRecord>> reads, final Map<Allele,List<Allele>> newToOldAlleleMapping) {
final ReadLikelihoods<Allele> original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(), EVEN_READ_START, EVEN_READ_START);
fillWithRandomLikelihoods(samples, alleles, original);
final ReadLikelihoods<Allele> marginalized = original.marginalize(newToOldAlleleMapping,evenReadOverlap);
Assert.assertNotNull(marginalized);
Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount());
for (int a = 0; a < marginalized.alleleCount(); a++) {
final List<Allele> oldAlleles = newToOldAlleleMapping.get(marginalized.allele(a));
Assert.assertNotNull(oldAlleles);
for (int s = 0; s < samples.length; s++) {
final ReadLikelihoods.Matrix<Allele> oldSampleLikelihoods = original.sampleMatrix(s);
final ReadLikelihoods.Matrix<Allele> sampleLikelihoods = marginalized.sampleMatrix(s);
final int sampleReadCount = sampleLikelihoods.readCount();
final int oldSampleReadCount = oldSampleLikelihoods.readCount();
Assert.assertEquals(sampleReadCount,(oldSampleReadCount + 1) / 2);
for (int r = 0; r < sampleReadCount; r++) {
double oldBestLk = Double.NEGATIVE_INFINITY;
for (final Allele oldAllele : oldAlleles) {
oldBestLk = Math.max(oldSampleLikelihoods.get(original.alleleIndex(oldAllele),r << 1), oldBestLk);
}
Assert.assertEquals(sampleLikelihoods.get(a,r),oldBestLk);
}
}
}
}
@Test(dataProvider = "marginalizationDataSets")
public void testMarginalization(final String[] samples, final Allele[] alleles, final Map<String,List<GATKSAMRecord>> reads, final Map<Allele,List<Allele>> newToOldAlleleMapping) {
final ReadLikelihoods<Allele> original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
fillWithRandomLikelihoods(samples, alleles, original);
final ReadLikelihoods<Allele> marginalized = original.marginalize(newToOldAlleleMapping);
Assert.assertNotNull(marginalized);
Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount());
for (int a = 0; a < marginalized.alleleCount(); a++) {
final List<Allele> oldAlleles = newToOldAlleleMapping.get(marginalized.allele(a));
Assert.assertNotNull(oldAlleles);
for (int s = 0; s < samples.length; s++) {
final ReadLikelihoods.Matrix<Allele> oldSampleLikelihoods = original.sampleMatrix(s);
final ReadLikelihoods.Matrix<Allele> sampleLikelihoods = marginalized.sampleMatrix(s);
final int sampleReadCount = sampleLikelihoods.readCount();
final int oldSampleReadCount = oldSampleLikelihoods.readCount();
Assert.assertEquals(oldSampleReadCount,sampleReadCount);
for (int r = 0; r < sampleReadCount; r++) {
double oldBestLk = Double.NEGATIVE_INFINITY;
for (final Allele oldAllele : oldAlleles) {
oldBestLk = Math.max(oldSampleLikelihoods.get(original.alleleIndex(oldAllele),r), oldBestLk);
}
Assert.assertEquals(sampleLikelihoods.get(a,r),oldBestLk);
}
}
}
}
@Test(dataProvider = "dataSets")
public void testNormalizeBestToZero(final String[] samples, final Allele[] alleles, final Map<String,List<GATKSAMRecord>> reads) {
final ReadLikelihoods<Allele> original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original);
final ReadLikelihoods<Allele> result= original.clone();
result.normalizeLikelihoods(true, Double.NEGATIVE_INFINITY);
testAlleleQueries(alleles,result);
final int alleleCount = alleles.length;
final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][];
for (int s = 0; s < samples.length; s++) {
final int sampleReadCount = original.sampleReadCount(s);
for (int a = 0; a < alleleCount; a++)
newLikelihoods[s][a] = new double[sampleReadCount];
for (int r = 0; r < sampleReadCount; r++) {
double bestLk = originalLikelihoods[s][0][r];
for (int a = 1; a < alleleCount; a++) {
bestLk = Math.max(bestLk,originalLikelihoods[s][a][r]);
}
for (int a = 0; a < alleleCount; a++) {
newLikelihoods[s][a][r] = originalLikelihoods[s][a][r] - bestLk;
}
}
}
testLikelihoodMatrixQueries(samples,result,newLikelihoods);
}
@Test(dataProvider = "dataSets")
public void testNormalizeCapWorstLK(final String[] samples, final Allele[] alleles, final Map<String,List<GATKSAMRecord>> reads) {
final ReadLikelihoods<Allele> original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original);
final ReadLikelihoods<Allele> result= original.clone();
result.normalizeLikelihoods(false, - 0.001);
testAlleleQueries(alleles,result);
final int alleleCount = alleles.length;
final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][];
for (int s = 0; s < samples.length; s++) {
final int sampleReadCount = original.sampleReadCount(s);
for (int a = 0; a < alleleCount; a++)
newLikelihoods[s][a] = new double[sampleReadCount];
for (int r = 0; r < sampleReadCount; r++) {
double bestAltLk = Double.NEGATIVE_INFINITY;
for (int a = 0; a < alleleCount; a++) {
if (alleles[a].isReference())
continue;
bestAltLk = Math.max(bestAltLk,originalLikelihoods[s][a][r]);
}
if (bestAltLk == Double.NEGATIVE_INFINITY)
for (int a = 0; a < alleleCount; a++) {
newLikelihoods[s][a][r] = originalLikelihoods[s][a][r];
}
else
for (int a = 0; a < alleleCount; a++) {
newLikelihoods[s][a][r] = Math.max(originalLikelihoods[s][a][r],bestAltLk - 0.001);
}
}
}
testLikelihoodMatrixQueries(samples,result,newLikelihoods);
}
@Test(dataProvider = "dataSets")
public void testNormalizeCapWorstLKAndBestToZero(final String[] samples, final Allele[] alleles, final Map<String,List<GATKSAMRecord>> reads) {
final ReadLikelihoods<Allele> original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original);
final ReadLikelihoods<Allele> result= original.clone();
result.normalizeLikelihoods(true, - 0.001);
testAlleleQueries(alleles,result);
final int alleleCount = alleles.length;
final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][];
for (int s = 0; s < samples.length; s++) {
final int sampleReadCount = original.sampleReadCount(s);
for (int a = 0; a < alleleCount; a++)
newLikelihoods[s][a] = new double[sampleReadCount];
for (int r = 0; r < sampleReadCount; r++) {
double bestAltLk = Double.NEGATIVE_INFINITY;
double bestLk = Double.NEGATIVE_INFINITY;
for (int a = 0; a < alleleCount; a++) {
bestLk = Math.max(bestLk,originalLikelihoods[s][a][r]);
if (alleles[a].isReference())
continue;
bestAltLk = Math.max(bestAltLk,originalLikelihoods[s][a][r]);
}
if (bestAltLk == Double.NEGATIVE_INFINITY)
for (int a = 0; a < alleleCount; a++) {
newLikelihoods[s][a][r] = originalLikelihoods[s][a][r] - bestLk;
}
else
for (int a = 0; a < alleleCount; a++) {
newLikelihoods[s][a][r] = Math.max(originalLikelihoods[s][a][r],bestAltLk - 0.001) - bestLk;
}
}
}
testLikelihoodMatrixQueries(samples,result,newLikelihoods);
}
@Test(dataProvider = "dataSets")
public void testAddMissingAlleles(final String[] samples, final Allele[] alleles, final Map<String,List<GATKSAMRecord>> reads) {
final ReadLikelihoods<Allele> original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original);
final ReadLikelihoods<Allele> result = original.clone();
// If all the alleles passed are present in the read-likelihoods collection there is no change.
result.addMissingAlleles(result.alleles(),Double.NEGATIVE_INFINITY);
testLikelihoodMatrixQueries(samples,result,originalLikelihoods);
// If the allele list passed is empty there is no effect.
result.addMissingAlleles(Collections.EMPTY_LIST,Double.NEGATIVE_INFINITY);
testLikelihoodMatrixQueries(samples,result,originalLikelihoods);
final Allele newOne;
final Allele newTwo;
final Allele newThree;
// We add a single missing.
result.addMissingAlleles(Arrays.asList(newOne = Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-12345.6);
Assert.assertEquals(original.alleleCount() + 1, result.alleleCount());
// We add two more amongst existing alleles:
result.addMissingAlleles(Arrays.asList(newTwo = Allele.create("ATATATTATATTAATATT".getBytes(), false),result.allele(1),
result.allele(0),newThree = Allele.create("TGTGTGTATTG".getBytes(),false),Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-6.54321);
Assert.assertEquals(original.alleleCount()+3,result.alleleCount());
final List<Allele> expectedAlleles = new ArrayList<>(original.alleles());
expectedAlleles.add(newOne); expectedAlleles.add(newTwo); expectedAlleles.add(newThree);
Assert.assertEquals(result.alleles(),expectedAlleles);
final double[][][] newLikelihoods = new double[originalLikelihoods.length][][];
for (int s = 0; s < samples.length; s++) {
newLikelihoods[s] = Arrays.copyOf(originalLikelihoods[s],originalLikelihoods[s].length + 3);
final int sampleReadCount = original.sampleReadCount(s);
final int originalAlleleCount = originalLikelihoods[s].length;
newLikelihoods[s][originalAlleleCount] = new double[sampleReadCount];
Arrays.fill(newLikelihoods[s][originalAlleleCount],-12345.6);
newLikelihoods[s][originalAlleleCount+1] = new double[sampleReadCount];
Arrays.fill(newLikelihoods[s][originalAlleleCount+1],-6.54321);
newLikelihoods[s][originalAlleleCount+2] = new double[sampleReadCount];
Arrays.fill(newLikelihoods[s][originalAlleleCount+2],-6.54321);
}
testLikelihoodMatrixQueries(samples,result,newLikelihoods);
}
@Test(dataProvider = "dataSets")
public void testAddNonRefAllele(final String[] samples, final Allele[] alleles, final Map<String,List<GATKSAMRecord>> reads) {
final ReadLikelihoods<Allele> original = new ReadLikelihoods<>(Arrays.asList(samples), Arrays.asList(alleles), reads);
final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original);
final ReadLikelihoods<Allele> result = original.clone();
result.addNonReferenceAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
Assert.assertEquals(result.alleleCount(),original.alleleCount() + 1);
Assert.assertEquals(result.alleleIndex(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE),result.alleleCount() - 1);
final double[][][] newLikelihoods = new double[originalLikelihoods.length][][];
for (int s = 0; s < samples.length; s++) {
newLikelihoods[s] = Arrays.copyOf(originalLikelihoods[s],originalLikelihoods[s].length + 1);
final int sampleReadCount = original.sampleReadCount(s);
final int ordinaryAlleleCount = originalLikelihoods[s].length;
newLikelihoods[s][ordinaryAlleleCount] = new double[sampleReadCount];
for (int r = 0; r < sampleReadCount; r++) {
double bestLk = newLikelihoods[s][0][r];
double secondBestLk = Double.NEGATIVE_INFINITY;
for (int a = 1; a < ordinaryAlleleCount; a++) {
final double lk = originalLikelihoods[s][a][r];
if (lk > bestLk) {
secondBestLk = bestLk;
bestLk = lk;
} else if (lk > secondBestLk) {
secondBestLk = lk;
}
}
final double expectedNonRefLk = Double.isInfinite(secondBestLk) ? bestLk : secondBestLk;
newLikelihoods[s][ordinaryAlleleCount][r] = expectedNonRefLk;
}
}
testLikelihoodMatrixQueries(samples,result,newLikelihoods);
}
private void testLikelihoodMatrixQueries(String[] samples, ReadLikelihoods<Allele> result, final double[][][] likelihoods) {
for (final String sample : samples) {
final int sampleIndex = result.sampleIndex(sample);
final double[][] likelihoodMatrix = result.sampleValues(sampleIndex);
final int sampleReadCount = result.sampleReadCount(sampleIndex);
Assert.assertEquals(result.alleleCount(), likelihoodMatrix.length);
for (int a = 0; a < likelihoodMatrix.length; a++) {
Assert.assertEquals(likelihoodMatrix[a].length,sampleReadCount);
for (int r = 0; r < sampleReadCount; r++)
Assert.assertEquals(likelihoodMatrix[a][r],
likelihoods == null ? 0.0 : likelihoods[sampleIndex][a][r], EPSILON);
}
}
}
private void testAlleleQueries(Allele[] alleles, ReadLikelihoods<Allele> result) {
final Set<Integer> alleleIndices = new HashSet<>();
for (final Allele allele : alleles) {
final int alleleIndex = result.alleleIndex(allele);
Assert.assertTrue(alleleIndex >= 0);
Assert.assertFalse(alleleIndices.contains(alleleIndex));
alleleIndices.add(alleleIndex);
Assert.assertSame(allele,alleles[alleleIndex]);
}
}
private void testSampleQueries(String[] samples, Map<String, List<GATKSAMRecord>> reads, ReadLikelihoods<Allele> result) {
final Set<Integer> sampleIds = new HashSet<>(samples.length);
for (final String sample : samples) {
final int sampleIndex = result.sampleIndex(sample);
Assert.assertTrue(sampleIndex >= 0);
Assert.assertFalse(sampleIds.contains(sampleIndex));
sampleIds.add(sampleIndex);
final List<GATKSAMRecord> sampleReads = result.sampleReads(sampleIndex);
final Set<GATKSAMRecord> sampleReadsSet = new HashSet<>(sampleReads);
final List<GATKSAMRecord> expectedSampleReadArray = reads.get(sample);
final Set<GATKSAMRecord> expectedSampleReadsSet = new HashSet<>(expectedSampleReadArray);
Assert.assertEquals(sampleReadsSet,expectedSampleReadsSet);
final int sampleReadCount = sampleReads.size();
for (int r = 0; r < sampleReadCount; r++) {
Assert.assertSame(sampleReads.get(r), expectedSampleReadArray.get(r));
final int readIndex = result.readIndex(sampleIndex, sampleReads.get(r));
Assert.assertEquals(readIndex,r);
}
}
}
private String[][] SAMPLE_SETS = new String[][] {
{"A","B","C"},
{"A"},
{"C","A","D","E","Salsa","Gazpacho"},
};
private Allele[][] ALLELE_SETS = new Allele[][] {
{Allele.create("A",true), Allele.create("T"), Allele.create("C")},
{Allele.create("A",true)},
{Allele.create("ATTTA"), Allele.create("A",true)},
{Allele.create("A"), Allele.create("AT",true)},
{Allele.create("A",false), Allele.create("AT",false)},
};
@DataProvider(name="marginalizationDataSets")
public Object[][] marginalizationDataSets() {
try {
final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
final Object[][] result = new Object[SAMPLE_SETS.length * ALLELE_SETS.length * ALLELE_SETS.length][];
int nextIndex = 0;
for (int s = 0; s < SAMPLE_SETS.length; s++)
for (int a = 0; a < ALLELE_SETS.length; a++) {
for (int b = 0; b < ALLELE_SETS.length; b++) {
if (ALLELE_SETS[b].length < ALLELE_SETS[a].length)
result[nextIndex++] = new Object[]{SAMPLE_SETS[s], ALLELE_SETS[a],
dataSetReads(SAMPLE_SETS[s], rnd), randomAlleleMap(ALLELE_SETS[a], ALLELE_SETS[b])
};
}
}
return Arrays.copyOf(result,nextIndex);
}catch (final Throwable e) {
throw new RuntimeException(e);
}
}
private Map<Allele,List<Allele>> randomAlleleMap(final Allele[] fromAlleles, final Allele[] toAlleles) {
final Map<Allele,List<Allele>> result = new HashMap<>(toAlleles.length);
for (final Allele toAllele : toAlleles )
result.put(toAllele,new ArrayList<Allele>(fromAlleles.length));
final ArrayList<Allele> remaining = new ArrayList<>(Arrays.asList(fromAlleles));
int nextToIndex = 0;
final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
for (int i = 0; i < fromAlleles.length; i++) {
final int fromAlleleIndex = rnd.nextInt(remaining.size());
result.get(toAlleles[nextToIndex]).add(remaining.remove(fromAlleleIndex));
nextToIndex = (nextToIndex + 1) % toAlleles.length;
}
return result;
}
@DataProvider(name="dataSets")
public Object[][] dataSets() {
try {
final Random rnd = GenomeAnalysisEngine.getRandomGenerator();
final Object[][] result = new Object[SAMPLE_SETS.length * ALLELE_SETS.length][];
int nextIndex = 0;
for (int s = 0; s < SAMPLE_SETS.length; s++)
for (int a = 0; a < ALLELE_SETS.length; a++) {
result[nextIndex++] = new Object[]{SAMPLE_SETS[s], ALLELE_SETS[a],
dataSetReads(SAMPLE_SETS[s], rnd)
};
}
return result;
}catch (final Throwable e) {
throw new RuntimeException(e);
}
}
final SAMFileHeader SAM_HEADER = ArtificialSAMUtils.createArtificialSamHeader();
final GenomeLocParser locParser = new GenomeLocParser(SAM_HEADER.getSequenceDictionary());
private Map<String,List<GATKSAMRecord>> dataSetReads(final String[] samples,
final Random rnd) {
final Map<String,List<GATKSAMRecord>> result = new HashMap<>(samples.length);
for (final String sample : samples) {
final int readCount = rnd.nextInt(100);
final List<GATKSAMRecord> reads = new ArrayList<>(readCount);
for (int r = 0; r < readCount; r++) {
final int alignmentStart = (r & 1) == 0 ? EVEN_READ_START : ODD_READ_START;
reads.add(ArtificialSAMUtils.createArtificialRead(SAM_HEADER,
"RRR" + sample + "00" + r, 0, alignmentStart ,"AAAAA".getBytes(), new byte[] {30,30,30,30,30}, "5M"));
}
result.put(sample,reads);
}
return result;
}
}
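
The test class above doubles as a reference for the new array-backed API. A minimal usage sketch, assuming only the constructor and accessors exercised by these tests; the sample name and read map below are illustrative placeholders:

// Build the container once, then address likelihoods by sample/allele/read index.
final Map<String, List<GATKSAMRecord>> readsBySample = new HashMap<>();
readsBySample.put("sampleA", new ArrayList<GATKSAMRecord>()); // stand-in; real callers pass the region's reads
final ReadLikelihoods<Allele> likelihoods = new ReadLikelihoods<>(
Arrays.asList("sampleA"),
Arrays.asList(Allele.create("A", true), Allele.create("T")),
readsBySample);
final ReadLikelihoods.Matrix<Allele> matrix = likelihoods.sampleMatrix(0);
// with reads present, entries are plain array positions rather than Map-of-Maps lookups:
// matrix.set(alleleIndex, readIndex, log10Lk) and matrix.get(alleleIndex, readIndex)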


@@ -25,13 +25,16 @@
 package org.broadinstitute.gatk.engine.downsampling;
-import org.broadinstitute.gatk.utils.*;
+import org.apache.log4j.Logger;
+import org.broadinstitute.gatk.utils.BaseUtils;
+import org.broadinstitute.gatk.utils.MathUtils;
 import org.broadinstitute.gatk.utils.collections.DefaultHashMap;
 import org.broadinstitute.gatk.utils.exceptions.GATKException;
 import org.broadinstitute.gatk.utils.exceptions.UserException;
-import org.broadinstitute.gatk.utils.pileup.*;
+import org.broadinstitute.gatk.utils.pileup.PileupElement;
+import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl;
 import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
-import org.broadinstitute.gatk.utils.BaseUtils;
 import org.broadinstitute.gatk.utils.text.XReadLines;
 import htsjdk.variant.variantcontext.Allele;
@@ -39,8 +42,6 @@
 import java.io.File;
 import java.io.IOException;
 import java.util.*;
-import org.apache.log4j.Logger;
 public class AlleleBiasedDownsamplingUtils {
 // define this class so that we can use Java generics below
@@ -216,7 +217,7 @@ public class AlleleBiasedDownsamplingUtils {
 * @param downsamplingFraction the fraction of total reads to remove per allele
 * @return list of reads TO REMOVE from allele biased down-sampling
 */
-public static List<GATKSAMRecord> selectAlleleBiasedReads(final Map<Allele, List<GATKSAMRecord>> alleleReadMap, final double downsamplingFraction) {
+public static <A extends Allele> List<GATKSAMRecord> selectAlleleBiasedReads(final Map<A, List<GATKSAMRecord>> alleleReadMap, final double downsamplingFraction) {
 int totalReads = 0;
 for ( final List<GATKSAMRecord> reads : alleleReadMap.values() )
 totalReads += reads.size();
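
A hedged sketch of a call site under the widened signature; the read lists are empty stand-ins, and with the new <A extends Allele> bound a map keyed by any Allele subtype (such as Haplotype) now compiles as well:

final List<GATKSAMRecord> refReads = new ArrayList<>(); // stand-in; real callers pass pileup-derived reads
final List<GATKSAMRecord> altReads = new ArrayList<>();
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<>();
alleleReadMap.put(Allele.create("A", true), refReads);
alleleReadMap.put(Allele.create("T"), altReads);
// remove ~10% of the total reads, biased per allele, as the javadoc above describes
final List<GATKSAMRecord> toRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, 0.1);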


@@ -27,17 +27,18 @@ package org.broadinstitute.gatk.tools.walkers.annotator;
 import com.google.java.contract.Ensures;
 import com.google.java.contract.Requires;
-import org.broadinstitute.gatk.utils.commandline.RodBinding;
+import htsjdk.variant.variantcontext.*;
+import htsjdk.variant.vcf.*;
 import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
 import org.broadinstitute.gatk.engine.contexts.AlignmentContext;
 import org.broadinstitute.gatk.engine.contexts.ReferenceContext;
 import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker;
 import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.*;
 import org.broadinstitute.gatk.utils.GenomeLoc;
+import org.broadinstitute.gatk.utils.commandline.RodBinding;
 import org.broadinstitute.gatk.utils.exceptions.UserException;
 import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
-import htsjdk.variant.variantcontext.*;
-import htsjdk.variant.vcf.*;
+import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
 import java.util.*;
@@ -204,6 +205,15 @@ public class VariantAnnotatorEngine {
 return annotateDBs(tracker, annotated);
 }
+public VariantContext annotateContextForActiveRegion(final RefMetaDataTracker tracker,
+final ReadLikelihoods<Allele> readLikelihoods,
+final VariantContext vc) {
+//TODO we transform the read-likelihoods into the previous Map-of-Maps form for the sake of not changing the annotation interface.
+//TODO should we change those interfaces?
+final Map<String, PerReadAlleleLikelihoodMap> annotationLikelihoods = readLikelihoods.toPerReadAlleleLikelihoodMap();
+return annotateContextForActiveRegion(tracker, annotationLikelihoods, vc);
+}
 public VariantContext annotateContextForActiveRegion(final RefMetaDataTracker tracker,
 final Map<String, PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap,
 final VariantContext vc) {


@@ -34,6 +34,7 @@ import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
 import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter;
 import org.broadinstitute.gatk.utils.text.TextFormattingUtils;
+import java.lang.reflect.Array;
 import java.math.BigInteger;
 import java.net.InetAddress;
 import java.security.MessageDigest;
@@ -882,4 +883,258 @@ public class Utils {
 return false;
 return true;
 }
/**
* Skims out positions of an array, returning a shorter one with the remaining positions in the same order.
* @param original the original array to splice.
* @param remove for each position in {@code original} indicates whether it should be spliced away ({@code true}),
* or retained ({@code false})
*
* @param <T> the array type.
*
* @throws IllegalArgumentException if either {@code original} or {@code remove} is {@code null},
* or {@code remove}'s length is different from {@code original}'s, or {@code original} is not in
* fact an array.
*
* @return never {@code null}.
*/
public static <T> T skimArray(final T original, final boolean[] remove) {
return skimArray(original,0,null,0,remove,0);
}
/**
* Skims out positions of an array, returning a shorter one with the remaining positions in the same order.
*
* <p>
* If the {@code dest} array provided is not long enough, a new one will be created and returned with the
* same component type. All elements before {@code destOffset} will be copied from the input to the
* result array. If {@code dest} is {@code null}, a brand-new array large enough will be created, where
* the positions preceding {@code destOffset} will be left with the default value. The component type
* will match the one of the {@code source} array.
* </p>
*
* @param source the original array to splice.
* @param sourceOffset the first position to skim.
* @param dest the destination array.
* @param destOffset the first position where to copy the skimmed array values.
* @param remove for each position in {@code original} indicates whether it should be spliced away ({@code true}),
* or retained ({@code false})
* @param removeOffset the first position in the remove index array to consider.
*
* @param <T> the array type.
*
* @throws IllegalArgumentException if either {@code original} or {@code remove} is {@code null},
* or {@code remove}'s length is different from {@code original}'s, or {@code original} is not in
* fact an array.
*
* @return never {@code null}.
*/
public static <T> T skimArray(final T source, final int sourceOffset, final T dest, final int destOffset, final boolean[] remove, final int removeOffset) {
if (source == null)
throw new IllegalArgumentException("the source array cannot be null");
@SuppressWarnings("unchecked")
final Class<T> sourceClazz = (Class<T>) source.getClass();
if (!sourceClazz.isArray())
throw new IllegalArgumentException("the source array is not in fact an array instance");
final int length = Array.getLength(source) - sourceOffset;
if (length < 0)
throw new IllegalArgumentException("the source offset goes beyond the source array length");
return skimArray(source,sourceOffset,dest,destOffset,remove,removeOffset,length);
}
/**
* Skims out positions of an array, returning a shorter one with the remaining positions in the same order.
*
* <p>
* If the {@code dest} array provided is not long enough, a new one will be created and returned with the
* same component type. All elements before {@code destOffset} will be copied from the input to the
* result array. If {@code dest} is {@code null}, a brand-new array large enough will be created, where
* the positions preceding {@code destOffset} will be left with the default value. The component type
* will match the one of the {@code source} array.
* </p>
*
* @param source the original array to splice.
* @param sourceOffset the first position to skim.
* @param dest the destination array.
* @param destOffset the first position where to copy the skimmed array values.
* @param remove for each position in {@code original} indicates whether it should be spliced away ({@code true}),
* or retained ({@code false})
* @param removeOffset the first position in the remove index array to consider.
* @param length the total number of positions in {@code source} to consider. Thus only the {@code sourceOffset} to
* {@code sourceOffset + length - 1} region will be skimmed.
*
* @param <T> the array type.
*
* @throws IllegalArgumentException if either {@code original} or {@code remove} is {@code null},
* or {@code remove}'s length is different from {@code original}'s, or {@code original} is not in
* fact an array.
*
* @return never {@code null}.
*/
public static <T> T skimArray(final T source, final int sourceOffset, final T dest, final int destOffset,
final boolean[] remove, final int removeOffset, final int length) {
if (source == null)
throw new IllegalArgumentException("the source array cannot be null");
if (remove == null)
throw new IllegalArgumentException("the remove array cannot be null");
if (sourceOffset < 0)
throw new IllegalArgumentException("the source array offset cannot be negative");
if (destOffset < 0)
throw new IllegalArgumentException("the destination array offset cannot be negative");
if (removeOffset < 0)
throw new IllegalArgumentException("the remove array offset cannot be negative");
if (length < 0)
throw new IllegalArgumentException("the length provided cannot be negative");
final int removeLength = Math.min(remove.length - removeOffset,length);
if (removeLength < 0)
throw new IllegalArgumentException("the remove offset provided falls beyond the remove array end");
@SuppressWarnings("unchecked")
final Class<T> sourceClazz = (Class<T>) source.getClass();
if (!sourceClazz.isArray())
throw new IllegalArgumentException("the source array is not in fact an array instance");
final Class<T> destClazz = skimArrayDetermineDestArrayClass(dest, sourceClazz);
final int sourceLength = Array.getLength(source);
if (sourceLength < length + sourceOffset)
throw new IllegalArgumentException("the source array is too small considering length and offset");
// count how many positions are to be removed.
int removeCount = 0;
final int removeEnd = removeLength + removeOffset;
for (int i = removeOffset; i < removeEnd; i++)
if (remove[i]) removeCount++;
final int newLength = length - removeCount;
@SuppressWarnings("unchecked")
final T result = skimArrayBuildResultArray(dest, destOffset, destClazz, newLength);
// No removals, just copy the whole thing.
if (removeCount == 0)
System.arraycopy(source,sourceOffset,result,destOffset,length);
else if (length > 0) { // if length == 0 nothing to do.
int nextOriginalIndex = 0;
int nextNewIndex = 0;
int nextRemoveIndex = removeOffset;
while (nextOriginalIndex < length && nextNewIndex < newLength) {
while (nextRemoveIndex < removeEnd && remove[nextRemoveIndex++]) { nextOriginalIndex++; } // skip positions to be spliced.
// Since we make the nextNewIndex < newLength check in the while condition
// there is no need to include the following break, as it is guaranteed never to trigger:
// if (nextOriginalIndex >= length) break; // we reached the end (the last positions are to be spliced).
final int copyStart = nextOriginalIndex;
while (++nextOriginalIndex < length && (nextRemoveIndex >= removeEnd || !remove[nextRemoveIndex])) { nextRemoveIndex++; }
final int copyEnd = nextOriginalIndex;
final int copyLength = copyEnd - copyStart;
System.arraycopy(source, sourceOffset + copyStart, result, destOffset + nextNewIndex, copyLength);
nextNewIndex += copyLength;
}
}
return result;
}
private static <T> T skimArrayBuildResultArray(final T dest, final int destOffset, final Class<T> destClazz, final int newLength) {
@SuppressWarnings("unchecked")
final T result;
if (dest == null)
result = (T) Array.newInstance(destClazz.getComponentType(), newLength + destOffset);
else if (Array.getLength(dest) < newLength + destOffset) {
result = (T) Array.newInstance(destClazz.getComponentType(),newLength + destOffset);
if (destOffset > 0) System.arraycopy(dest,0,result,0,destOffset);
} else
result = dest;
return result;
}
private static <T> Class<T> skimArrayDetermineDestArrayClass(final T dest, Class<T> sourceClazz) {
final Class<T> destClazz;
if (dest == null)
destClazz = sourceClazz;
else {
destClazz = (Class<T>) dest.getClass();
if (destClazz != sourceClazz) {
if (!destClazz.isArray())
throw new IllegalArgumentException("the destination array class must be an array");
if (!destClazz.getComponentType().isAssignableFrom(sourceClazz.getComponentType()))
throw new IllegalArgumentException("the provided destination array class cannot contain values from the source due to type incompatibility");
}
}
return destClazz;
}
/**
* Makes a deep clone of the array provided.
*
* <p>
* Unlike {@link Arrays#copyOf} or an array's own {@link Object#clone()}, which only copy the top level,
* if the array is multi-dimensional each sub-array or matrix is cloned as well.
* </p>
*
* <p>
* Notice however that if the base type is an Object type, the base elements themselves won't be cloned.
* </p>
*
* @param array the array to deep-clone.
* @param <T> type of the array.
*
* @throws IllegalArgumentException if {@code array} is {@code null} or is not an array.
*/
public static <T> T deepCloneArray(final T array) {
if (array == null)
throw new IllegalArgumentException("");
@SuppressWarnings("unchecked")
final Class<T> clazz = (Class<T>) array.getClass();
if (!clazz.isArray())
throw new IllegalArgumentException("the input is not an array");
final int dimension = calculateArrayDimensions(clazz);
return deepCloneArrayUnchecked(array,clazz, dimension);
}
private static int calculateArrayDimensions(final Class<?> clazz) {
if (clazz.isArray())
return calculateArrayDimensions(clazz.getComponentType()) + 1;
else
return 0;
}
private static <T> T deepCloneArrayUnchecked(final T array, final Class<T> clazz, final int dimension) {
final int length = Array.getLength(array);
final Class componentClass = clazz.getComponentType();
final T result = (T) Array.newInstance(componentClass,length);
if (dimension <= 1) {
System.arraycopy(array, 0, result, 0, length);
return result;
}
final int dimensionMinus1 = dimension - 1;
for (int i = 0; i < length; i++)
Array.set(result,i,deepCloneArrayUnchecked(Array.get(array,i),componentClass,dimensionMinus1));
return result;
}
}
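
A small worked example of the skimming contract, assuming only the single-argument overload above (the sample string reappears in the unit-test data further below):

final byte[] source = "romeo+juliette".getBytes();
final boolean[] remove = new boolean[source.length]; // false means retain the position
remove[5] = true; // splice out the '+'
remove[13] = true; // splice out the trailing 'e'
final byte[] skimmed = Utils.skimArray(source, remove);
// new String(skimmed) yields "romeojuliett"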


@@ -26,14 +26,15 @@
 package org.broadinstitute.gatk.utils.pairhmm;
 import com.google.java.contract.Requires;
+import htsjdk.variant.variantcontext.Allele;
 import org.apache.log4j.Logger;
 import org.broadinstitute.gatk.utils.MathUtils;
-import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods;
 import org.broadinstitute.gatk.utils.haplotype.Haplotype;
 import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
-import htsjdk.variant.variantcontext.Allele;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.List;
 import java.util.Map;
 /**
@@ -128,6 +129,26 @@ public abstract class PairHMM {
 initialize(readMaxLength, haplotypeMaxLength);
 }
private int findMaxReadLength(final GATKSAMRecord ... reads) {
int max = 0;
for (final GATKSAMRecord read : reads) {
final int readLength = read.getReadLength();
if (max < readLength)
max = readLength;
}
return max;
}
private int findMaxAlleleLength(final List<? extends Allele> alleles) {
int max = 0;
for (final Allele allele : alleles) {
final int alleleLength = allele.length();
if (max < alleleLength)
max = alleleLength;
}
return max;
}
 protected int findMaxReadLength(final List<GATKSAMRecord> reads) {
 int listMaxReadLength = 0;
 for(GATKSAMRecord read : reads){
@@ -137,10 +158,9 @@
 return listMaxReadLength;
 }
-protected int findMaxHaplotypeLength(final Map<Allele, Haplotype> haplotypeMap) {
+protected int findMaxHaplotypeLength(final Collection<Haplotype> haplotypes) {
 int listMaxHaplotypeLength = 0;
-for( final Allele a: haplotypeMap.keySet() ) {
-final Haplotype h = haplotypeMap.get(a);
+for( final Haplotype h : haplotypes) {
 final int haplotypeLength = h.getBases().length;
 if( haplotypeLength > listMaxHaplotypeLength ) { listMaxHaplotypeLength = haplotypeLength; }
 }
@@ -151,71 +171,61 @@
 /**
 * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from
 * each haplotype given base substitution, insertion, and deletion probabilities.
 *
-* @param reads the list of reads
-* @param alleleHaplotypeMap the list of haplotypes
-* @param GCPArrayMap Each read is associated with an array containing the gap continuation penalties for use in the model. Length of each GCP-array must match that of its read.
-* @return a PerReadAlleleLikelihoodMap containing each read, haplotype-allele, and the log10 probability of
-* said read coming from the said haplotype under the provided error model
+* @param processedReads reads to analyze instead of the ones present in the destination read-likelihoods.
+* @param likelihoods where to store the likelihoods; position [a][r] is reserved for the likelihood of {@code reads[r]}
+* conditional to {@code alleles[a]}.
+* @param gcp map from each processed read to its array of gap continuation penalties.
+*
+* @throws IllegalArgumentException
 */
-public PerReadAlleleLikelihoodMap computeLikelihoods(final List<GATKSAMRecord> reads, final Map<Allele, Haplotype> alleleHaplotypeMap, final Map<GATKSAMRecord, byte[]> GCPArrayMap) {
+public void computeLikelihoods(final ReadLikelihoods.Matrix<Haplotype> likelihoods,
+final List<GATKSAMRecord> processedReads,
+final Map<GATKSAMRecord,byte[]> gcp) {
+if (processedReads.isEmpty())
+return;
 if(doProfiling)
 startTime = System.nanoTime();
 // (re)initialize the pairHMM only if necessary
-final int readMaxLength = findMaxReadLength(reads);
-final int haplotypeMaxLength = findMaxHaplotypeLength(alleleHaplotypeMap);
-if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) { initialize(readMaxLength, haplotypeMaxLength); }
+final int readMaxLength = findMaxReadLength(processedReads);
+final int haplotypeMaxLength = findMaxAlleleLength(likelihoods.alleles());
+if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength)
+initialize(readMaxLength, haplotypeMaxLength);
-final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
-mLikelihoodArray = new double[reads.size()*alleleHaplotypeMap.size()];
+final int readCount = processedReads.size();
+final List<Haplotype> alleles = likelihoods.alleles();
+final int alleleCount = alleles.size();
+mLikelihoodArray = new double[readCount * alleleCount];
 int idx = 0;
-for(GATKSAMRecord read : reads){
+int readIndex = 0;
+for(final GATKSAMRecord read : processedReads){
 final byte[] readBases = read.getReadBases();
 final byte[] readQuals = read.getBaseQualities();
 final byte[] readInsQuals = read.getBaseInsertionQualities();
 final byte[] readDelQuals = read.getBaseDeletionQualities();
-final byte[] overallGCP = GCPArrayMap.get(read);
+final byte[] overallGCP = gcp.get(read);
 // peek at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation)
-byte[] currentHaplotypeBases = null;
-boolean isFirstHaplotype = true;
-Allele currentAllele = null;
-double log10l;
-//for (final Allele allele : alleleHaplotypeMap.keySet()){
-for (Map.Entry<Allele,Haplotype> currEntry : alleleHaplotypeMap.entrySet()){
-//final Haplotype haplotype = alleleHaplotypeMap.get(allele);
-final Allele allele = currEntry.getKey();
-final Haplotype haplotype = currEntry.getValue();
-final byte[] nextHaplotypeBases = haplotype.getBases();
-if (currentHaplotypeBases != null) {
-log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases,
-readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextHaplotypeBases);
-mLikelihoodArray[idx++] = log10l;
-likelihoodMap.add(read, currentAllele, log10l);
-}
-// update the current haplotype
-currentHaplotypeBases = nextHaplotypeBases;
-currentAllele = allele;
-}
-// process the final haplotype
-if (currentHaplotypeBases != null) {
-// there is no next haplotype, so pass null for nextHaplotypeBases.
-log10l = computeReadLikelihoodGivenHaplotypeLog10(currentHaplotypeBases,
-readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, null);
-likelihoodMap.add(read, currentAllele, log10l);
-mLikelihoodArray[idx++] = log10l;
-}
-}
+final boolean isFirstHaplotype = true;
+for (int a = 0; a < alleleCount; a++) {
+final Allele allele = alleles.get(a);
+final byte[] alleleBases = allele.getBases();
+final byte[] nextAlleleBases = a == alleles.size() - 1 ? null : alleles.get(a + 1).getBases();
+final double lk = computeReadLikelihoodGivenHaplotypeLog10(alleleBases,
+readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextAlleleBases);
+likelihoods.set(a, readIndex, lk);
+mLikelihoodArray[idx++] = lk;
+}
+readIndex++;
+}
-if(doProfiling)
-{
+if(doProfiling) {
 threadLocalPairHMMComputeTimeDiff = (System.nanoTime() - startTime);
 //synchronized(doProfiling)
 {
 pairHMMComputeTime += threadLocalPairHMMComputeTimeDiff;
 }
 }
-return likelihoodMap;
 }
 /**

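A hedged sketch of the new calling pattern: the caller owns the ReadLikelihoods container and the PairHMM fills one sample's matrix in place. sampleIndex, readLikelihoods, pairHMM and gcpByRead are illustrative names; sampleReads(...) and sampleMatrix(...) are the accessors exercised by the unit tests above:

final List<GATKSAMRecord> sampleReads = readLikelihoods.sampleReads(sampleIndex);
final ReadLikelihoods.Matrix<Haplotype> sampleMatrix = readLikelihoods.sampleMatrix(sampleIndex);
pairHMM.computeLikelihoods(sampleMatrix, sampleReads, gcpByRead); // gcpByRead: Map<GATKSAMRecord,byte[]>
// sampleMatrix.get(a, r) now holds the log10 likelihood of read r given haplotype a
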

@@ -31,7 +31,6 @@ import htsjdk.samtools.Cigar;
 import htsjdk.samtools.CigarElement;
 import htsjdk.samtools.CigarOperator;
 import htsjdk.samtools.SAMRecord;
-import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
 import org.broadinstitute.gatk.utils.BaseUtils;
 import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
 import org.broadinstitute.gatk.utils.haplotype.Haplotype;
@@ -75,7 +74,11 @@ public final class AlignmentUtils {
 * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference
 * @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame.
 * @param isInformative true if the read is differentially informative for one of the haplotypes
-* @return a GATKSAMRecord aligned to reference, or null if no meaningful alignment is possible
+*
+* @throws IllegalArgumentException if {@code originalRead} is {@code null} or {@code haplotype} is {@code null} or it
+* does not have a Cigar or the {@code referenceStart} is invalid (less than 1).
+*
+* @return a GATKSAMRecord aligned to reference. Never {@code null}.
 */
 public static GATKSAMRecord createReadAlignedToRef(final GATKSAMRecord originalRead,
 final Haplotype haplotype,


@@ -28,7 +28,6 @@ package org.broadinstitute.gatk.utils;
 import org.apache.commons.io.FileUtils;
 import org.broadinstitute.gatk.utils.io.IOUtils;
 import org.testng.Assert;
-import org.broadinstitute.gatk.utils.BaseTest;
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;
@@ -255,4 +254,37 @@ public class UtilsUnitTest extends BaseTest {
 };
 }
@Test(dataProvider = "skimArrayData")
public void testSkimArray(final String original, final String remove) {
final StringBuilder resultBuilder = new StringBuilder();
final boolean[] removeBoolean = new boolean[remove.length()];
for (int i = 0; i < original.length(); i++)
if (remove.charAt(i) == '1') {
resultBuilder.append(original.charAt(i));
removeBoolean[i] = false;
} else
removeBoolean[i] = true;
final String expected = resultBuilder.toString();
final byte[] resultBytes = Utils.skimArray(original.getBytes(),removeBoolean);
final String resultString = new String(resultBytes);
Assert.assertEquals(resultString,expected);
}
@DataProvider(name = "skimArrayData")
public Object[][] skimArrayData() {
return new Object[][] {
{"romeo+juliette" , "11111111111111" },
{"romeo+juliette" , "11111011111111" },
{"romeo+juliette" , "00000011111111" },
{"romeo+juliette" , "11111100000000" },
{"romeo+juliette" , "11111011111111" },
{"romeo+juliette" , "01111010000001" },
{"romeo+juliette" , "01100110000110" },
{"romeo+juliette" , "10101010101010" },
{"romeo+juliette" , "01010101010101" },
{"romeo+juliette" , "01111010111001" },
};
}
}