Merge branch 'master' of ssh://gsa1/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
9333b678b5
|
|
@ -41,6 +41,7 @@ import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -51,6 +52,7 @@ import java.io.File;
|
|||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.lang.reflect.Method;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.*;
|
||||
|
||||
/**
|
||||
* User: aaron
|
||||
|
|
@ -204,7 +206,7 @@ public class SAMDataSource {
|
|||
BAQ.QualityMode.DONT_MODIFY,
|
||||
null, // no BAQ
|
||||
(byte) -1);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new SAM data source given the supplied read metadata.
|
||||
|
|
@ -258,17 +260,15 @@ public class SAMDataSource {
|
|||
if(readBufferSize != null)
|
||||
ReadShard.setReadBufferSize(readBufferSize);
|
||||
|
||||
for (SAMReaderID readerID : samFiles) {
|
||||
if (!readerID.samFile.canRead())
|
||||
throw new UserException.CouldNotReadInputFile(readerID.samFile,"file is not present or user does not have appropriate permissions. " +
|
||||
"Please check that the file is present and readable and try again.");
|
||||
}
|
||||
|
||||
resourcePool = new SAMResourcePool(Integer.MAX_VALUE);
|
||||
SAMReaders readers = resourcePool.getAvailableReaders();
|
||||
|
||||
// Determine the sort order.
|
||||
for(SAMReaderID readerID: readerIDs) {
|
||||
if (! readerID.samFile.canRead() )
|
||||
throw new UserException.CouldNotReadInputFile(readerID.samFile,"file is not present or user does not have appropriate permissions. " +
|
||||
"Please check that the file is present and readable and try again.");
|
||||
|
||||
// Get the sort order, forcing it to coordinate if unsorted.
|
||||
SAMFileReader reader = readers.getReader(readerID);
|
||||
SAMFileHeader header = reader.getFileHeader();
|
||||
|
|
@ -714,29 +714,68 @@ public class SAMDataSource {
|
|||
* @param validationStringency validation stringency.
|
||||
*/
|
||||
public SAMReaders(Collection<SAMReaderID> readerIDs, SAMFileReader.ValidationStringency validationStringency) {
|
||||
final int N_THREADS = 8;
|
||||
int totalNumberOfFiles = readerIDs.size();
|
||||
int readerNumber = 1;
|
||||
for(SAMReaderID readerID: readerIDs) {
|
||||
File indexFile = findIndexFile(readerID.samFile);
|
||||
|
||||
SAMFileReader reader = null;
|
||||
|
||||
if(threadAllocation.getNumIOThreads() > 0) {
|
||||
BlockInputStream blockInputStream = new BlockInputStream(dispatcher,readerID,false);
|
||||
reader = new SAMFileReader(blockInputStream,indexFile,false);
|
||||
inputStreams.put(readerID,blockInputStream);
|
||||
}
|
||||
else
|
||||
reader = new SAMFileReader(readerID.samFile,indexFile,false);
|
||||
reader.setSAMRecordFactory(factory);
|
||||
|
||||
reader.enableFileSource(true);
|
||||
reader.setValidationStringency(validationStringency);
|
||||
|
||||
logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.samFile));
|
||||
|
||||
readers.put(readerID,reader);
|
||||
ExecutorService executor = Executors.newFixedThreadPool(N_THREADS);
|
||||
final List<ReaderInitializer> inits = new ArrayList<ReaderInitializer>(totalNumberOfFiles);
|
||||
Queue<Future<ReaderInitializer>> futures = new LinkedList<Future<ReaderInitializer>>();
|
||||
for (SAMReaderID readerID: readerIDs) {
|
||||
logger.debug("Enqueuing for initialization: " + readerID.samFile);
|
||||
final ReaderInitializer init = new ReaderInitializer(readerID);
|
||||
inits.add(init);
|
||||
futures.add(executor.submit(init));
|
||||
}
|
||||
|
||||
final SimpleTimer timer = new SimpleTimer();
|
||||
try {
|
||||
final int MAX_WAIT = 30 * 1000;
|
||||
final int MIN_WAIT = 1 * 1000;
|
||||
|
||||
timer.start();
|
||||
while ( ! futures.isEmpty() ) {
|
||||
final int prevSize = futures.size();
|
||||
final double waitTime = prevSize * (0.5 / N_THREADS); // about 0.5 seconds to load each file
|
||||
final int waitTimeInMS = Math.min(MAX_WAIT, Math.max((int) (waitTime * 1000), MIN_WAIT));
|
||||
Thread.sleep(waitTimeInMS);
|
||||
|
||||
Queue<Future<ReaderInitializer>> pending = new LinkedList<Future<ReaderInitializer>>();
|
||||
for ( final Future<ReaderInitializer> initFuture : futures ) {
|
||||
if ( initFuture.isDone() ) {
|
||||
final ReaderInitializer init = initFuture.get();
|
||||
if (threadAllocation.getNumIOThreads() > 0) {
|
||||
inputStreams.put(init.readerID, init.blockInputStream); // get from initializer
|
||||
}
|
||||
logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, init.readerID));
|
||||
readers.put(init.readerID, init.reader);
|
||||
} else {
|
||||
pending.add(initFuture);
|
||||
}
|
||||
}
|
||||
|
||||
final int pendingSize = pending.size();
|
||||
final int nExecutedInTick = prevSize - pendingSize;
|
||||
final int nExecutedTotal = totalNumberOfFiles - pendingSize;
|
||||
final double totalTimeInSeconds = timer.getElapsedTime();
|
||||
final double nTasksPerSecond = nExecutedTotal / (1.0*totalTimeInSeconds);
|
||||
final int nRemaining = pendingSize;
|
||||
final double estTimeToComplete = pendingSize / nTasksPerSecond;
|
||||
logger.info(String.format("Init %d BAMs in last %d s, %d of %d in %.2f s / %.2f m (%.2f tasks/s). %d remaining with est. completion in %.2f s / %.2f m",
|
||||
nExecutedInTick, (int)(waitTimeInMS / 1000.0),
|
||||
nExecutedTotal, totalNumberOfFiles, totalTimeInSeconds, totalTimeInSeconds / 60, nTasksPerSecond,
|
||||
nRemaining, estTimeToComplete, estTimeToComplete / 60));
|
||||
|
||||
futures = pending;
|
||||
}
|
||||
} catch ( InterruptedException e ) {
|
||||
throw new ReviewedStingException("Interrupted SAMReader initialization", e);
|
||||
} catch ( ExecutionException e ) {
|
||||
throw new ReviewedStingException("Execution exception during SAMReader initialization", e);
|
||||
}
|
||||
|
||||
logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime()));
|
||||
executor.shutdown();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -809,6 +848,30 @@ public class SAMDataSource {
|
|||
}
|
||||
}
|
||||
|
||||
class ReaderInitializer implements Callable<ReaderInitializer> {
|
||||
final SAMReaderID readerID;
|
||||
BlockInputStream blockInputStream = null;
|
||||
SAMFileReader reader;
|
||||
|
||||
public ReaderInitializer(final SAMReaderID readerID) {
|
||||
this.readerID = readerID;
|
||||
}
|
||||
|
||||
public ReaderInitializer call() {
|
||||
final File indexFile = findIndexFile(readerID.samFile);
|
||||
if (threadAllocation.getNumIOThreads() > 0) {
|
||||
blockInputStream = new BlockInputStream(dispatcher,readerID,false);
|
||||
reader = new SAMFileReader(blockInputStream,indexFile,false);
|
||||
}
|
||||
else
|
||||
reader = new SAMFileReader(readerID.samFile,indexFile,false);
|
||||
reader.setSAMRecordFactory(factory);
|
||||
reader.enableFileSource(true);
|
||||
reader.setValidationStringency(validationStringency);
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
private class ReleasingIterator implements StingSAMIterator {
|
||||
/**
|
||||
* The resource acting as the source of the data.
|
||||
|
|
@ -991,12 +1054,12 @@ public class SAMDataSource {
|
|||
return
|
||||
// Read ends on a later contig, or...
|
||||
read.getReferenceIndex() > intervalContigIndices[currentBound] ||
|
||||
// Read ends of this contig...
|
||||
(read.getReferenceIndex() == intervalContigIndices[currentBound] &&
|
||||
// either after this location, or...
|
||||
(read.getAlignmentEnd() >= intervalStarts[currentBound] ||
|
||||
// read is unmapped but positioned and alignment start is on or after this start point.
|
||||
(read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound])));
|
||||
// Read ends of this contig...
|
||||
(read.getReferenceIndex() == intervalContigIndices[currentBound] &&
|
||||
// either after this location, or...
|
||||
(read.getAlignmentEnd() >= intervalStarts[currentBound] ||
|
||||
// read is unmapped but positioned and alignment start is on or after this start point.
|
||||
(read.getReadUnmappedFlag() && read.getAlignmentStart() >= intervalStarts[currentBound])));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -1008,8 +1071,8 @@ public class SAMDataSource {
|
|||
return
|
||||
// Read starts on a prior contig, or...
|
||||
read.getReferenceIndex() < intervalContigIndices[currentBound] ||
|
||||
// Read starts on this contig and the alignment start is registered before this end point.
|
||||
(read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]);
|
||||
// Read starts on this contig and the alignment start is registered before this end point.
|
||||
(read.getReferenceIndex() == intervalContigIndices[currentBound] && read.getAlignmentStart() <= intervalEnds[currentBound]);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
|
@ -65,11 +64,9 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable {
|
|||
* @param GLs genotype likelihoods
|
||||
* @param Alleles Alleles corresponding to GLs
|
||||
* @param log10AlleleFrequencyPriors priors
|
||||
* @param log10AlleleFrequencyLikelihoods array (pre-allocated) to store likelihoods results
|
||||
* @param log10AlleleFrequencyPosteriors array (pre-allocated) to store posteriors results
|
||||
* @param result (pre-allocated) object to store likelihoods results
|
||||
*/
|
||||
protected abstract void getLog10PNonRef(GenotypesContext GLs, List<Allele> Alleles,
|
||||
double[][] log10AlleleFrequencyPriors,
|
||||
double[][] log10AlleleFrequencyLikelihoods,
|
||||
double[][] log10AlleleFrequencyPosteriors);
|
||||
AlleleFrequencyCalculationResult result);
|
||||
}
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Dec 14, 2011
|
||||
*
|
||||
* Useful helper class to communicate the results of the allele frequency calculation
|
||||
*/
|
||||
public class AlleleFrequencyCalculationResult {
|
||||
|
||||
final double[][] log10AlleleFrequencyLikelihoods;
|
||||
final double[][] log10AlleleFrequencyPosteriors;
|
||||
|
||||
AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) {
|
||||
log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1];
|
||||
log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1];
|
||||
}
|
||||
}
|
||||
|
|
@ -47,16 +47,15 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
public void getLog10PNonRef(final GenotypesContext GLs,
|
||||
final List<Allele> alleles,
|
||||
final double[][] log10AlleleFrequencyPriors,
|
||||
final double[][] log10AlleleFrequencyLikelihoods,
|
||||
final double[][] log10AlleleFrequencyPosteriors) {
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
final int numAlleles = alleles.size();
|
||||
|
||||
//linearExact(GLs, log10AlleleFrequencyPriors[0], log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
|
||||
linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors, false);
|
||||
linearExactMultiAllelic(GLs, numAlleles - 1, log10AlleleFrequencyPriors, result, false);
|
||||
}
|
||||
|
||||
private static final ArrayList<double[]> getGLs(GenotypesContext GLs) {
|
||||
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>(); // TODO -- initialize with size of GLs
|
||||
ArrayList<double[]> genotypeLikelihoods = new ArrayList<double[]>(GLs.size());
|
||||
|
||||
genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
|
||||
for ( Genotype sample : GLs.iterateInSampleNameOrder() ) {
|
||||
|
|
@ -196,10 +195,13 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
public static void linearExactMultiAllelic(final GenotypesContext GLs,
|
||||
final int numAlternateAlleles,
|
||||
final double[][] log10AlleleFrequencyPriors,
|
||||
final double[][] log10AlleleFrequencyLikelihoods,
|
||||
final double[][] log10AlleleFrequencyPosteriors,
|
||||
final AlleleFrequencyCalculationResult result,
|
||||
final boolean preserveData) {
|
||||
|
||||
// make sure the PL cache has been initialized
|
||||
if ( UnifiedGenotyperEngine.PLIndexToAlleleIndex == null )
|
||||
UnifiedGenotyperEngine.calculatePLcache(5);
|
||||
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||
final int numSamples = genotypeLikelihoods.size()-1;
|
||||
final int numChr = 2*numSamples;
|
||||
|
|
@ -221,7 +223,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
while ( !ACqueue.isEmpty() ) {
|
||||
// compute log10Likelihoods
|
||||
final ExactACset set = ACqueue.remove();
|
||||
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
|
||||
final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, maxLog10L, numChr, preserveData, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result);
|
||||
|
||||
// adjust max likelihood seen if needed
|
||||
maxLog10L = Math.max(maxLog10L, log10LofKs);
|
||||
|
|
@ -236,14 +238,13 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
final Queue<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||
final double[][] log10AlleleFrequencyPriors,
|
||||
final double[][] log10AlleleFrequencyLikelihoods,
|
||||
final double[][] log10AlleleFrequencyPosteriors) {
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
if ( DEBUG )
|
||||
System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts);
|
||||
|
||||
// compute the log10Likelihoods
|
||||
computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors);
|
||||
computeLofK(set, genotypeLikelihoods, indexesToACset, log10AlleleFrequencyPriors, result);
|
||||
|
||||
// clean up memory
|
||||
if ( !preserveData ) {
|
||||
|
|
@ -349,8 +350,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
final ArrayList<double[]> genotypeLikelihoods,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset,
|
||||
final double[][] log10AlleleFrequencyPriors,
|
||||
final double[][] log10AlleleFrequencyLikelihoods,
|
||||
final double[][] log10AlleleFrequencyPosteriors) {
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
set.log10Likelihoods[0] = 0.0; // the zero case
|
||||
final int totalK = set.getACsum();
|
||||
|
|
@ -410,19 +410,15 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
// update the likelihoods/posteriors vectors which are collapsed views of each of the various ACs
|
||||
for ( int i = 0; i < set.ACcounts.getCounts().length; i++ ) {
|
||||
int AC = set.ACcounts.getCounts()[i];
|
||||
log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(log10AlleleFrequencyLikelihoods[i][AC], log10LofK);
|
||||
result.log10AlleleFrequencyLikelihoods[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyLikelihoods[i][AC], log10LofK);
|
||||
|
||||
// for k=0 we still want to use theta
|
||||
final double prior = (nonRefAlleles == 0) ? log10AlleleFrequencyPriors[0][0] : log10AlleleFrequencyPriors[nonRefAlleles-1][AC];
|
||||
log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior);
|
||||
result.log10AlleleFrequencyPosteriors[i][AC] = approximateLog10SumLog10(result.log10AlleleFrequencyPosteriors[i][AC], log10LofK + prior);
|
||||
}
|
||||
}
|
||||
|
||||
private static double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) {
|
||||
// todo -- arent' there a small number of fixed values that this function can adopt?
|
||||
// todo -- at a minimum it'd be good to partially compute some of these in ACCounts for performance
|
||||
// todo -- need to cache PLIndex -> two alleles, compute looping over each PLIndex. Note all other operations are efficient
|
||||
// todo -- this can be computed once at the start of the all operations
|
||||
|
||||
// the closed form representation generalized for multiple alleles is as follows:
|
||||
// AA: (2j - totalK) * (2j - totalK - 1)
|
||||
|
|
@ -438,25 +434,19 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
|||
if ( PLindex <= numAltAlleles )
|
||||
return MathUtils.log10Cache[2*ACcounts[PLindex-1]] + MathUtils.log10Cache[2*j-totalK];
|
||||
|
||||
int subtractor = numAltAlleles+1;
|
||||
int subtractions = 0;
|
||||
do {
|
||||
PLindex -= subtractor;
|
||||
subtractor--;
|
||||
subtractions++;
|
||||
}
|
||||
while ( PLindex >= subtractor );
|
||||
// find the 2 alternate alleles that are represented by this PL index
|
||||
int[] alleles = UnifiedGenotyperEngine.PLIndexToAlleleIndex[numAltAlleles][PLindex];
|
||||
|
||||
final int k_i = ACcounts[subtractions-1];
|
||||
final int k_i = ACcounts[alleles[0]-1]; // subtract one because ACcounts doesn't consider the reference allele
|
||||
|
||||
// the hom var case (e.g. BB, CC, DD)
|
||||
final double coeff;
|
||||
if ( PLindex == 0 ) {
|
||||
if ( alleles[0] == alleles[1] ) {
|
||||
coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1];
|
||||
}
|
||||
// the het non-ref case (e.g. BC, BD, CD)
|
||||
else {
|
||||
final int k_j = ACcounts[subtractions+PLindex-1];
|
||||
final int k_j = ACcounts[alleles[1]-1];
|
||||
coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j];
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
|
|
@ -95,7 +96,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
determineAlternateAlleles(basesToUse, refBase, contexts, useBAQedPileup);
|
||||
|
||||
// how many alternate alleles are we using?
|
||||
int alleleCounter = countSetBits(basesToUse);
|
||||
int alleleCounter = Utils.countSetBits(basesToUse);
|
||||
|
||||
// if there are no non-ref alleles...
|
||||
if ( alleleCounter == 0 ) {
|
||||
|
|
@ -109,7 +110,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
}
|
||||
|
||||
// create the alternate alleles and the allele ordering (the ordering is crucial for the GLs)
|
||||
final int numAltAlleles = countSetBits(basesToUse);
|
||||
final int numAltAlleles = Utils.countSetBits(basesToUse);
|
||||
final int[] alleleOrdering = new int[numAltAlleles + 1];
|
||||
alleleOrdering[0] = indexOfRefBase;
|
||||
int alleleOrderingIndex = 1;
|
||||
|
|
@ -161,15 +162,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
|
|||
return builder.genotypes(genotypes).make();
|
||||
}
|
||||
|
||||
private int countSetBits(boolean[] array) {
|
||||
int counter = 0;
|
||||
for ( int i = 0; i < array.length; i++ ) {
|
||||
if ( array[i] )
|
||||
counter++;
|
||||
}
|
||||
return counter;
|
||||
}
|
||||
|
||||
// fills in the allelesToUse array
|
||||
protected void determineAlternateAlleles(boolean[] allelesToUse, byte ref, Map<String, AlignmentContext> contexts, boolean useBAQedPileup) {
|
||||
int[] qualCounts = new int[4];
|
||||
|
|
|
|||
|
|
@ -75,14 +75,13 @@ public class UnifiedGenotyperEngine {
|
|||
// the model used for calculating p(non-ref)
|
||||
private ThreadLocal<AlleleFrequencyCalculationModel> afcm = new ThreadLocal<AlleleFrequencyCalculationModel>();
|
||||
|
||||
// the allele frequency likelihoods (allocated once as an optimization)
|
||||
private ThreadLocal<AlleleFrequencyCalculationResult> alleleFrequencyCalculationResult = new ThreadLocal<AlleleFrequencyCalculationResult>();
|
||||
|
||||
// because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything
|
||||
private final double[][] log10AlleleFrequencyPriorsSNPs;
|
||||
private final double[][] log10AlleleFrequencyPriorsIndels;
|
||||
|
||||
// the allele frequency likelihoods (allocated once as an optimization)
|
||||
private ThreadLocal<double[][]> log10AlleleFrequencyLikelihoods = new ThreadLocal<double[][]>();
|
||||
private ThreadLocal<double[][]> log10AlleleFrequencyPosteriors = new ThreadLocal<double[][]>();
|
||||
|
||||
// the priors object
|
||||
private final GenotypePriors genotypePriorsSNPs;
|
||||
private final GenotypePriors genotypePriorsIndels;
|
||||
|
|
@ -103,6 +102,9 @@ public class UnifiedGenotyperEngine {
|
|||
private final GenomeLocParser genomeLocParser;
|
||||
private final boolean BAQEnabledOnCMDLine;
|
||||
|
||||
// a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles
|
||||
// the representation is int[number of alternate alleles][PL index][pair of allele indexes (where reference = 0)]
|
||||
protected static int[][][] PLIndexToAlleleIndex;
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -136,6 +138,27 @@ public class UnifiedGenotyperEngine {
|
|||
genotypePriorsIndels = createGenotypePriors(GenotypeLikelihoodsCalculationModel.Model.INDEL);
|
||||
|
||||
filter.add(LOW_QUAL_FILTER_NAME);
|
||||
calculatePLcache(UAC.MAX_ALTERNATE_ALLELES);
|
||||
}
|
||||
|
||||
protected static void calculatePLcache(int maxAltAlleles) {
|
||||
PLIndexToAlleleIndex = new int[maxAltAlleles+1][][];
|
||||
PLIndexToAlleleIndex[0] = new int[][]{ new int[]{0, 0} };
|
||||
int numLikelihoods = 1;
|
||||
|
||||
// for each count of alternate alleles
|
||||
for ( int altAlleles = 1; altAlleles <= maxAltAlleles; altAlleles++ ) {
|
||||
numLikelihoods += altAlleles + 1;
|
||||
PLIndexToAlleleIndex[altAlleles] = new int[numLikelihoods][];
|
||||
int PLindex = 0;
|
||||
|
||||
// for all possible combinations of the 2 alt alleles
|
||||
for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) {
|
||||
for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) {
|
||||
PLIndexToAlleleIndex[altAlleles][PLindex++] = new int[]{ allele1, allele2 };
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -264,9 +287,8 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// initialize the data for this thread if that hasn't been done yet
|
||||
if ( afcm.get() == null ) {
|
||||
log10AlleleFrequencyLikelihoods.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]);
|
||||
log10AlleleFrequencyPosteriors.set(new double[UAC.MAX_ALTERNATE_ALLELES][N+1]);
|
||||
afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC));
|
||||
alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES, N));
|
||||
}
|
||||
|
||||
// don't try to genotype too many alternate alleles
|
||||
|
|
@ -285,9 +307,9 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
|
||||
// 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position)
|
||||
clearAFarray(log10AlleleFrequencyLikelihoods.get());
|
||||
clearAFarray(log10AlleleFrequencyPosteriors.get());
|
||||
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods);
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors);
|
||||
afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get());
|
||||
|
||||
// is the most likely frequency conformation AC=0 for all alternate alleles?
|
||||
boolean bestGuessIsRef = true;
|
||||
|
|
@ -299,7 +321,7 @@ public class UnifiedGenotyperEngine {
|
|||
// determine which alternate alleles have AF>0
|
||||
boolean[] altAllelesToUse = new boolean[vc.getAlternateAlleles().size()];
|
||||
for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
|
||||
int indexOfBestAC = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()[i]);
|
||||
int indexOfBestAC = MathUtils.maxElementIndex(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i]);
|
||||
|
||||
// if the most likely AC is not 0, then this is a good alternate allele to use
|
||||
if ( indexOfBestAC != 0 ) {
|
||||
|
|
@ -320,7 +342,7 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// calculate p(f>0)
|
||||
// TODO -- right now we just calculate it for the alt allele with highest AF, but the likelihoods need to be combined correctly over all AFs
|
||||
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()[indexOfHighestAlt]);
|
||||
double[] normalizedPosteriors = MathUtils.normalizeFromLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[indexOfHighestAlt]);
|
||||
double sum = 0.0;
|
||||
for (int i = 1; i <= N; i++)
|
||||
sum += normalizedPosteriors[i];
|
||||
|
|
@ -330,15 +352,15 @@ public class UnifiedGenotyperEngine {
|
|||
if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]);
|
||||
if ( Double.isInfinite(phredScaledConfidence) )
|
||||
phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0][0];
|
||||
phredScaledConfidence = -10.0 * alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0];
|
||||
} else {
|
||||
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
|
||||
if ( Double.isInfinite(phredScaledConfidence) ) {
|
||||
sum = 0.0;
|
||||
for (int i = 1; i <= N; i++) {
|
||||
if ( log10AlleleFrequencyPosteriors.get()[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
|
||||
if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED )
|
||||
break;
|
||||
sum += log10AlleleFrequencyPosteriors.get()[0][i];
|
||||
sum += alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i];
|
||||
}
|
||||
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
|
||||
}
|
||||
|
|
@ -378,7 +400,7 @@ public class UnifiedGenotyperEngine {
|
|||
}
|
||||
|
||||
// create the genotypes
|
||||
GenotypesContext genotypes = assignGenotypes(vc, altAllelesToUse);
|
||||
GenotypesContext genotypes = assignGenotypes(vc, altAllelesToUse, myAlleles);
|
||||
|
||||
// print out stats if we have a writer
|
||||
if ( verboseWriter != null && !limitedContext )
|
||||
|
|
@ -396,31 +418,31 @@ public class UnifiedGenotyperEngine {
|
|||
|
||||
// the overall lod
|
||||
VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model);
|
||||
clearAFarray(log10AlleleFrequencyLikelihoods.get());
|
||||
clearAFarray(log10AlleleFrequencyPosteriors.get());
|
||||
afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods);
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors);
|
||||
afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get());
|
||||
//double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0];
|
||||
double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1);
|
||||
double overallLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1);
|
||||
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
|
||||
|
||||
// the forward lod
|
||||
VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model);
|
||||
clearAFarray(log10AlleleFrequencyLikelihoods.get());
|
||||
clearAFarray(log10AlleleFrequencyPosteriors.get());
|
||||
afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods);
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors);
|
||||
afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get());
|
||||
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
|
||||
double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0][0];
|
||||
double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1);
|
||||
double forwardLog10PofNull = alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0];
|
||||
double forwardLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1);
|
||||
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
|
||||
|
||||
// the reverse lod
|
||||
VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model);
|
||||
clearAFarray(log10AlleleFrequencyLikelihoods.get());
|
||||
clearAFarray(log10AlleleFrequencyPosteriors.get());
|
||||
afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyLikelihoods.get(), log10AlleleFrequencyPosteriors.get());
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyLikelihoods);
|
||||
clearAFarray(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors);
|
||||
afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), alleleFrequencyCalculationResult.get());
|
||||
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true);
|
||||
double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0][0];
|
||||
double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get()[0], 1);
|
||||
double reverseLog10PofNull = alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][0];
|
||||
double reverseLog10PofF = MathUtils.log10sumLog10(alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0], 1);
|
||||
//if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
|
||||
|
||||
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
|
||||
|
|
@ -587,10 +609,10 @@ public class UnifiedGenotyperEngine {
|
|||
AFline.append(i + "/" + N + "\t");
|
||||
AFline.append(String.format("%.2f\t", ((float)i)/N));
|
||||
AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i]));
|
||||
if ( log10AlleleFrequencyPosteriors.get()[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED)
|
||||
if ( alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[0][i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED)
|
||||
AFline.append("0.00000000\t");
|
||||
else
|
||||
AFline.append(String.format("%.8f\t", log10AlleleFrequencyPosteriors.get()[i]));
|
||||
AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().log10AlleleFrequencyPosteriors[i]));
|
||||
AFline.append(String.format("%.8f\t", normalizedPosteriors[i]));
|
||||
verboseWriter.println(AFline.toString());
|
||||
}
|
||||
|
|
@ -750,82 +772,85 @@ public class UnifiedGenotyperEngine {
|
|||
/**
|
||||
* @param vc variant context with genotype likelihoods
|
||||
* @param allelesToUse bit vector describing which alternate alleles from the vc are okay to use
|
||||
* @param newAlleles a list of the final new alleles to use
|
||||
*
|
||||
* @return genotypes
|
||||
*/
|
||||
public GenotypesContext assignGenotypes(VariantContext vc,
|
||||
boolean[] allelesToUse) {
|
||||
public GenotypesContext assignGenotypes(final VariantContext vc,
|
||||
final boolean[] allelesToUse,
|
||||
final List<Allele> newAlleles) {
|
||||
|
||||
// the no-called genotypes
|
||||
final GenotypesContext GLs = vc.getGenotypes();
|
||||
|
||||
// samples
|
||||
final List<String> sampleIndices = GLs.getSampleNamesOrderedByName();
|
||||
|
||||
// the new called genotypes to create
|
||||
final GenotypesContext calls = GenotypesContext.create();
|
||||
|
||||
// we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward
|
||||
final int numOriginalAltAlleles = allelesToUse.length;
|
||||
final int numNewAltAlleles = newAlleles.size() - 1;
|
||||
ArrayList<Integer> likelihoodIndexesToUse = null;
|
||||
|
||||
// an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles,
|
||||
// then we can keep the PLs as is; otherwise, we determine which ones to keep
|
||||
if ( numNewAltAlleles != numOriginalAltAlleles && numNewAltAlleles > 0 ) {
|
||||
likelihoodIndexesToUse = new ArrayList<Integer>(30);
|
||||
final int[][] PLcache = PLIndexToAlleleIndex[numOriginalAltAlleles];
|
||||
|
||||
for ( int PLindex = 0; PLindex < PLcache.length; PLindex++ ) {
|
||||
int[] alleles = PLcache[PLindex];
|
||||
// consider this entry only if both of the alleles are good
|
||||
if ( (alleles[0] == 0 || allelesToUse[alleles[0] - 1]) && (alleles[1] == 0 || allelesToUse[alleles[1] - 1]) )
|
||||
likelihoodIndexesToUse.add(PLindex);
|
||||
}
|
||||
}
|
||||
|
||||
// create the new genotypes
|
||||
for ( int k = GLs.size() - 1; k >= 0; k-- ) {
|
||||
final String sample = sampleIndices.get(k);
|
||||
final Genotype g = GLs.get(sample);
|
||||
if ( !g.hasLikelihoods() )
|
||||
continue;
|
||||
|
||||
final double[] likelihoods = g.getLikelihoods().getAsVector();
|
||||
// create the new likelihoods array from the alleles we are allowed to use
|
||||
final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
|
||||
double[] newLikelihoods;
|
||||
if ( likelihoodIndexesToUse == null ) {
|
||||
newLikelihoods = originalLikelihoods;
|
||||
} else {
|
||||
newLikelihoods = new double[likelihoodIndexesToUse.size()];
|
||||
int newIndex = 0;
|
||||
for ( int oldIndex : likelihoodIndexesToUse )
|
||||
newLikelihoods[newIndex++] = originalLikelihoods[oldIndex];
|
||||
|
||||
// if there is no mass on the likelihoods, then just no-call the sample
|
||||
if ( MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL ) {
|
||||
// might need to re-normalize
|
||||
newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true);
|
||||
}
|
||||
|
||||
// if there is no mass on the (new) likelihoods and we actually have alternate alleles, then just no-call the sample
|
||||
if ( MathUtils.sum(newLikelihoods) > SUM_GL_THRESH_NOCALL ) {
|
||||
calls.add(new Genotype(g.getSampleName(), NO_CALL_ALLELES, Genotype.NO_LOG10_PERROR, null, null, false));
|
||||
continue;
|
||||
}
|
||||
|
||||
// genotype likelihoods are a linear vector that can be thought of as a row-wise upper triangular matrix of log10Likelihoods.
|
||||
// so e.g. with 2 alt alleles the likelihoods are AA,AB,AC,BB,BC,CC and with 3 alt alleles they are AA,AB,AC,AD,BB,BC,BD,CC,CD,DD.
|
||||
|
||||
final int numAltAlleles = allelesToUse.length;
|
||||
|
||||
// start with the assumption that the ideal genotype is homozygous reference
|
||||
Allele maxAllele1 = vc.getReference(), maxAllele2 = vc.getReference();
|
||||
double maxLikelihoodSeen = likelihoods[0];
|
||||
int indexOfMax = 0;
|
||||
|
||||
// keep track of some state
|
||||
Allele firstAllele = vc.getReference();
|
||||
int subtractor = numAltAlleles + 1;
|
||||
int subtractionsMade = 0;
|
||||
|
||||
for ( int i = 1, PLindex = 1; i < likelihoods.length; i++, PLindex++ ) {
|
||||
if ( PLindex == subtractor ) {
|
||||
firstAllele = vc.getAlternateAllele(subtractionsMade);
|
||||
PLindex -= subtractor;
|
||||
subtractor--;
|
||||
subtractionsMade++;
|
||||
|
||||
// we can skip this allele if it's not usable
|
||||
if ( !allelesToUse[subtractionsMade-1] ) {
|
||||
i += subtractor - 1;
|
||||
PLindex += subtractor - 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// we don't care about the entry if we've already seen better
|
||||
if ( likelihoods[i] <= maxLikelihoodSeen )
|
||||
continue;
|
||||
|
||||
// if it's usable then update the alleles
|
||||
int alleleIndex = subtractionsMade + PLindex - 1;
|
||||
if ( allelesToUse[alleleIndex] ) {
|
||||
maxAllele1 = firstAllele;
|
||||
maxAllele2 = vc.getAlternateAllele(alleleIndex);
|
||||
maxLikelihoodSeen = likelihoods[i];
|
||||
indexOfMax = i;
|
||||
}
|
||||
}
|
||||
// find the genotype with maximum likelihoods
|
||||
int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);
|
||||
int[] alleles = PLIndexToAlleleIndex[numNewAltAlleles][PLindex];
|
||||
|
||||
ArrayList<Allele> myAlleles = new ArrayList<Allele>();
|
||||
myAlleles.add(maxAllele1);
|
||||
myAlleles.add(maxAllele2);
|
||||
myAlleles.add(newAlleles.get(alleles[0]));
|
||||
myAlleles.add(newAlleles.get(alleles[1]));
|
||||
|
||||
final double qual = GenotypeLikelihoods.getQualFromLikelihoods(indexOfMax, likelihoods);
|
||||
calls.add(new Genotype(sample, myAlleles, qual, null, g.getAttributes(), false));
|
||||
final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods);
|
||||
Map<String, Object> attrs = new HashMap<String, Object>(g.getAttributes());
|
||||
if ( numNewAltAlleles == 0 )
|
||||
attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY);
|
||||
else
|
||||
attrs.put(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(newLikelihoods));
|
||||
calls.add(new Genotype(sample, myAlleles, qual, null, attrs, false));
|
||||
}
|
||||
|
||||
return calls;
|
||||
|
|
|
|||
|
|
@ -182,6 +182,8 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
|
|||
nHomDerived++;
|
||||
}
|
||||
|
||||
break;
|
||||
case MIXED:
|
||||
break;
|
||||
default:
|
||||
throw new ReviewedStingException("BUG: Unexpected genotype type: " + g);
|
||||
|
|
|
|||
|
|
@ -388,6 +388,15 @@ public class Utils {
|
|||
return reallocate(pos, z);
|
||||
}
|
||||
|
||||
public static int countSetBits(boolean[] array) {
|
||||
int counter = 0;
|
||||
for ( int i = 0; i < array.length; i++ ) {
|
||||
if ( array[i] )
|
||||
counter++;
|
||||
}
|
||||
return counter;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns new (reallocated) integer array of the specified size, with content
|
||||
* of the original array <code>orig</code> copied into it. If <code>newSize</code> is
|
||||
|
|
|
|||
|
|
@ -168,7 +168,14 @@ public class ReadClipper {
|
|||
try {
|
||||
GATKSAMRecord clippedRead = (GATKSAMRecord) read.clone();
|
||||
for (ClippingOp op : getOps()) {
|
||||
clippedRead = op.apply(algorithm, clippedRead);
|
||||
//check if the clipped read can still be clipped in the range requested
|
||||
if (op.start < clippedRead.getReadLength()) {
|
||||
ClippingOp fixedOperation = op;
|
||||
if (op.stop > clippedRead.getReadLength())
|
||||
fixedOperation = new ClippingOp(op.start, clippedRead.getReadLength() - 1);
|
||||
|
||||
clippedRead = fixedOperation.apply(algorithm, clippedRead);
|
||||
}
|
||||
}
|
||||
wasClipped = true;
|
||||
ops.clear();
|
||||
|
|
|
|||
|
|
@ -184,7 +184,6 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
|||
* @return a feature, (not guaranteed complete) that has the correct start and stop
|
||||
*/
|
||||
public Feature decodeLoc(String line) {
|
||||
lineNo++;
|
||||
|
||||
// the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
|
||||
if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
|
||||
|
|
@ -279,6 +278,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
|||
builder.source(getName());
|
||||
|
||||
// increment the line count
|
||||
// TODO -- because of the way the engine utilizes Tribble, we can parse a line multiple times (especially when
|
||||
// TODO -- the first record is far along the contig) and the line counter can get out of sync
|
||||
lineNo++;
|
||||
|
||||
// parse out the required fields
|
||||
|
|
@ -594,6 +595,11 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec {
|
|||
if ( a.isSymbolic() )
|
||||
continue;
|
||||
|
||||
// we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong
|
||||
// position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine).
|
||||
if ( a.length() - clipping == 0 )
|
||||
return clipping - 1;
|
||||
|
||||
if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 )
|
||||
stillClipping = false;
|
||||
else if ( ref.length() == clipping )
|
||||
|
|
|
|||
|
|
@ -120,6 +120,8 @@ public class VCF3Codec extends AbstractVCFCodec {
|
|||
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
|
||||
|
||||
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
|
||||
if ( nParts != genotypeParts.length )
|
||||
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo);
|
||||
|
||||
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);
|
||||
|
||||
|
|
|
|||
|
|
@ -147,6 +147,8 @@ public class VCFCodec extends AbstractVCFCodec {
|
|||
genotypeParts = new String[header.getColumnCount() - NUM_STANDARD_FIELDS];
|
||||
|
||||
int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR);
|
||||
if ( nParts != genotypeParts.length )
|
||||
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo);
|
||||
|
||||
ArrayList<Genotype> genotypes = new ArrayList<Genotype>(nParts);
|
||||
|
||||
|
|
|
|||
|
|
@ -184,11 +184,11 @@ public class UserException extends ReviewedStingException {
|
|||
|
||||
public static class MalformedVCF extends UserException {
|
||||
public MalformedVCF(String message, String line) {
|
||||
super(String.format("The provided VCF file is malformed at line %s: %s", line, message));
|
||||
super(String.format("The provided VCF file is malformed at approximately line %s: %s", line, message));
|
||||
}
|
||||
|
||||
public MalformedVCF(String message, int lineNo) {
|
||||
super(String.format("The provided VCF file is malformed at line number %d: %s", lineNo, message));
|
||||
super(String.format("The provided VCF file is malformed at approximately line number %d: %s", lineNo, message));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -200,6 +200,48 @@ public class ArtificialSAMUtils {
|
|||
return rec;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an artificial read based on the parameters
|
||||
*
|
||||
* @param header the SAM header to associate the read with
|
||||
* @param name the name of the read
|
||||
* @param refIndex the reference index, i.e. what chromosome to associate it with
|
||||
* @param alignmentStart where to start the alignment
|
||||
* @param bases the sequence of the read
|
||||
* @param qual the qualities of the read
|
||||
* @param cigar the cigar string of the read
|
||||
*
|
||||
* @return the artificial read
|
||||
*/
|
||||
public static GATKSAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar ) {
|
||||
GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases, qual);
|
||||
rec.setCigarString(cigar);
|
||||
return rec;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an artificial read with the following default parameters :
|
||||
* header:
|
||||
* numberOfChromosomes = 1
|
||||
* startingChromosome = 1
|
||||
* chromosomeSize = 1000000
|
||||
* read:
|
||||
* name = "default_read"
|
||||
* refIndex = 0
|
||||
* alignmentStart = 1
|
||||
*
|
||||
* @param bases the sequence of the read
|
||||
* @param qual the qualities of the read
|
||||
* @param cigar the cigar string of the read
|
||||
*
|
||||
* @return the artificial read
|
||||
*/
|
||||
public static GATKSAMRecord createArtificialRead( byte[] bases, byte[] qual, String cigar ) {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000);
|
||||
return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 1, bases, qual, cigar);
|
||||
}
|
||||
|
||||
|
||||
public final static List<GATKSAMRecord> createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) {
|
||||
GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen);
|
||||
GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen);
|
||||
|
|
|
|||
|
|
@ -410,6 +410,12 @@ public class GenotypesContext implements List<Genotype> {
|
|||
return getGenotypes().get(i);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets sample associated with this sampleName, or null if none is found
|
||||
*
|
||||
* @param sampleName
|
||||
* @return
|
||||
*/
|
||||
public Genotype get(final String sampleName) {
|
||||
Integer offset = getSampleI(sampleName);
|
||||
return offset == null ? null : getGenotypes().get(offset);
|
||||
|
|
@ -648,16 +654,15 @@ public class GenotypesContext implements List<Genotype> {
|
|||
@Ensures("result != null")
|
||||
public GenotypesContext subsetToSamples( final Set<String> samples ) {
|
||||
final int nSamples = samples.size();
|
||||
final int nGenotypes = size();
|
||||
|
||||
if ( nSamples == nGenotypes )
|
||||
return this;
|
||||
else if ( nSamples == 0 )
|
||||
if ( nSamples == 0 )
|
||||
return NO_GENOTYPES;
|
||||
else { // nGenotypes < nSamples
|
||||
final GenotypesContext subset = create(samples.size());
|
||||
for ( final String sample : samples ) {
|
||||
subset.add(get(sample));
|
||||
final Genotype g = get(sample);
|
||||
if ( g != null )
|
||||
subset.add(g);
|
||||
}
|
||||
return subset;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -83,21 +83,20 @@ public class ExactAFCalculationModelUnitTest extends BaseTest {
|
|||
@Test(dataProvider = "getGLs")
|
||||
public void testGLs(GetGLsTest cfg) {
|
||||
|
||||
final double[][] log10AlleleFrequencyLikelihoods = new double[2][2*numSamples+1];
|
||||
final double[][] log10AlleleFrequencyPosteriors = new double[2][2*numSamples+1];
|
||||
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2, 2*numSamples);
|
||||
for ( int i = 0; i < 2; i++ ) {
|
||||
for ( int j = 0; j < 2*numSamples+1; j++ ) {
|
||||
log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
|
||||
log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
|
||||
result.log10AlleleFrequencyLikelihoods[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
|
||||
result.log10AlleleFrequencyPosteriors[i][j] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED;
|
||||
}
|
||||
}
|
||||
|
||||
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors, false);
|
||||
ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result, false);
|
||||
|
||||
int nameIndex = 1;
|
||||
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
|
||||
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
|
||||
int calculatedAlleleCount = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors[allele]);
|
||||
int calculatedAlleleCount = MathUtils.maxElementIndex(result.log10AlleleFrequencyPosteriors[allele]);
|
||||
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -129,8 +129,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
|
|||
public void testOutputParameter() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "-sites_only", "44f3b5b40e6ad44486cddfdb7e0bfcd8" );
|
||||
e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "94e53320f14c5ff29d62f68d36b46fcd" );
|
||||
e.put( "--output_mode EMIT_ALL_SITES", "73ad1cc41786b12c5f0e6f3e9ec2b728" );
|
||||
e.put( "--output_mode EMIT_ALL_CONFIDENT_SITES", "274bd9d1b9c7857690fa5f0228ffc6d7" );
|
||||
e.put( "--output_mode EMIT_ALL_SITES", "594c6d3c48bbc73289de7727d768644d" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
|
|||
|
|
@ -24,6 +24,18 @@ public class ClipReadsTestUtils {
|
|||
final static String BASES = "ACTG";
|
||||
final static String QUALS = "!+5?"; //ASCII values = 33,43,53,63
|
||||
|
||||
public static void assertEqualReads(GATKSAMRecord actual, GATKSAMRecord expected) {
|
||||
// If they're both not empty, test their contents
|
||||
if(!actual.isEmpty() && !expected.isEmpty()) {
|
||||
Assert.assertEquals(actual.getReadBases(), expected.getReadBases());
|
||||
Assert.assertEquals(actual.getBaseQualities(), expected.getBaseQualities());
|
||||
Assert.assertEquals(actual.getCigarString(), expected.getCigarString());
|
||||
}
|
||||
// Otherwise test if they're both empty
|
||||
else
|
||||
Assert.assertEquals(actual.isEmpty(), expected.isEmpty());
|
||||
}
|
||||
|
||||
public static void testBaseQualCigar(GATKSAMRecord read, byte[] readBases, byte[] baseQuals, String cigar) {
|
||||
// Because quals to char start at 33 for visibility
|
||||
baseQuals = subtractToArray(baseQuals, 33);
|
||||
|
|
@ -48,7 +60,7 @@ public class ClipReadsTestUtils {
|
|||
Assert.assertTrue(read.isEmpty());
|
||||
}
|
||||
|
||||
private static byte[] subtractToArray(byte[] array, int n) {
|
||||
public static byte[] subtractToArray(byte[] array, int n) {
|
||||
if (array == null)
|
||||
return null;
|
||||
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ import net.sf.samtools.CigarElement;
|
|||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.TextCigarCodec;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.testng.Assert;
|
||||
|
|
@ -230,42 +231,25 @@ public class ReadClipperUnitTest extends BaseTest {
|
|||
|
||||
@Test(enabled = true)
|
||||
public void testHardClipLowQualEnds() {
|
||||
// Needs a thorough redesign
|
||||
logger.warn("Executing testHardClipByReferenceCoordinates");
|
||||
logger.warn("Executing testHardClipLowQualEnds");
|
||||
|
||||
//Clip whole read
|
||||
Assert.assertEquals(readClipper.hardClipLowQualEnds((byte) 64), new GATKSAMRecord(readClipper.read.getHeader()));
|
||||
// Testing clipping that ends inside an insertion
|
||||
final byte[] BASES = {'A','C','G','T','A','C','G','T'};
|
||||
final byte[] QUALS = {2, 2, 2, 2, 20, 20, 20, 2};
|
||||
final String CIGAR = "1S1M5I1S";
|
||||
|
||||
List<TestParameter> testList = new LinkedList<TestParameter>();
|
||||
testList.add(new TestParameter(1, -1, 1, 4, "1H3M"));//clip 1 base at start
|
||||
testList.add(new TestParameter(11, -1, 2, 4, "2H2M"));//clip 2 bases at start
|
||||
final byte[] CLIPPED_BASES = {};
|
||||
final byte[] CLIPPED_QUALS = {};
|
||||
final String CLIPPED_CIGAR = "";
|
||||
|
||||
for (TestParameter p : testList) {
|
||||
init();
|
||||
//logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar);
|
||||
ClipReadsTestUtils.testBaseQualCigar(readClipper.hardClipLowQualEnds((byte) p.inputStart),
|
||||
ClipReadsTestUtils.BASES.substring(p.substringStart, p.substringStop).getBytes(),
|
||||
ClipReadsTestUtils.QUALS.substring(p.substringStart, p.substringStop).getBytes(),
|
||||
p.cigar);
|
||||
}
|
||||
/* todo find a better way to test lowqual tail clipping on both sides
|
||||
// Reverse Quals sequence
|
||||
readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33
|
||||
|
||||
testList = new LinkedList<testParameter>();
|
||||
testList.add(new testParameter(1,-1,0,3,"3M1H"));//clip 1 base at end
|
||||
testList.add(new testParameter(11,-1,0,2,"2M2H"));//clip 2 bases at end
|
||||
GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR);
|
||||
GATKSAMRecord expected = ArtificialSAMUtils.createArtificialRead(CLIPPED_BASES, CLIPPED_QUALS, CLIPPED_CIGAR);
|
||||
|
||||
ReadClipper lowQualClipper = new ReadClipper(read);
|
||||
ClipReadsTestUtils.assertEqualReads(lowQualClipper.hardClipLowQualEnds((byte) 2), expected);
|
||||
|
||||
|
||||
for ( testParameter p : testList ) {
|
||||
init();
|
||||
readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33
|
||||
//logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar);
|
||||
testBaseQualCigar( readClipper.hardClipLowQualEnds( (byte)p.inputStart ),
|
||||
BASES.substring(p.substringStart,p.substringStop).getBytes(),
|
||||
QUALS.substring(p.substringStart,p.substringStop),
|
||||
p.cigar );
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
@Test(enabled = false)
|
||||
|
|
|
|||
|
|
@ -704,11 +704,14 @@ public class VariantContextUnitTest extends BaseTest {
|
|||
public Object[][] MakeSubContextTest() {
|
||||
for ( boolean updateAlleles : Arrays.asList(true, false)) {
|
||||
new SubContextTest(Collections.<String>emptySet(), updateAlleles);
|
||||
new SubContextTest(Collections.singleton("MISSING"), updateAlleles);
|
||||
new SubContextTest(Collections.singleton("AA"), updateAlleles);
|
||||
new SubContextTest(Collections.singleton("AT"), updateAlleles);
|
||||
new SubContextTest(Collections.singleton("TT"), updateAlleles);
|
||||
new SubContextTest(Arrays.asList("AA", "AT"), updateAlleles);
|
||||
new SubContextTest(Arrays.asList("AA", "AT", "TT"), updateAlleles);
|
||||
new SubContextTest(Arrays.asList("AA", "AT", "MISSING"), updateAlleles);
|
||||
new SubContextTest(Arrays.asList("AA", "AT", "TT", "MISSING"), updateAlleles);
|
||||
}
|
||||
|
||||
return SubContextTest.getTests(SubContextTest.class);
|
||||
|
|
|
|||
Loading…
Reference in New Issue