Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Guillermo del Angel 2012-04-09 14:53:19 -04:00
commit 719ec9144a
26 changed files with 1893 additions and 1545 deletions

View File

@ -955,8 +955,8 @@
<jvmarg value="-Dpipeline.run=${pipeline.run}" />
<jvmarg value="-Djava.io.tmpdir=${java.io.tmpdir}" />
<jvmarg line="${cofoja.jvm.args}"/>
<!-- <jvmarg value="-Xdebug"/> -->
<!-- <jvmarg value="-Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=5678"/> -->
<!-- <jvmarg value="-Xdebug"/> -->
<!-- <jvmarg value="-Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=5005"/> -->
<classfileset dir="${java.public.test.classes}" includes="**/@{testtype}.class"/>
<classfileset dir="${java.private.test.classes}" erroronmissingdir="false">

View File

@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.*;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
@ -35,8 +37,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
@ -45,6 +45,8 @@ import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.stubs.Stub;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
@ -190,7 +192,7 @@ public class GenomeAnalysisEngine {
private BaseRecalibration baseRecalibration = null;
public BaseRecalibration getBaseRecalibration() { return baseRecalibration; }
public boolean hasBaseRecalibration() { return baseRecalibration != null; }
public void setBaseRecalibration(File recalFile) { baseRecalibration = new BaseRecalibration(recalFile); }
public void setBaseRecalibration(File recalFile, int quantizationLevels) { baseRecalibration = new BaseRecalibration(recalFile, quantizationLevels); }
/**
* Actually run the GATK with the specified walker.
@ -216,7 +218,7 @@ public class GenomeAnalysisEngine {
// if the user specified an input BQSR recalibration table then enable on-the-fly recalibration
if (this.getArguments().BQSR_RECAL_FILE != null)
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE);
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels);
// Determine how the threads should be divided between CPU vs. IO.
determineThreadAllocation();

View File

@ -193,6 +193,16 @@ public class GATKArgumentCollection {
@Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration")
public File BQSR_RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously
/**
* Turns on the base quantization module. It requires a recalibration report (-BQSR).
*
* A value of 0 here means "do not quantize".
* Any value greater than zero will be used to recalculate the quantization using this many levels.
* Negative values do nothing (i.e. quantize using the recalibration report's quantization level -- same as not providing this parameter at all)
*/
@Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels.", required=false)
public int quantizationLevels = -1;
@Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
public byte defaultBaseQualities = -1;

View File

@ -191,6 +191,16 @@ public class ReferenceContext {
return basesCache;
}
/**
 * All the bases in the window from the current base forward to the end of the window.
 *
 * @return a newly allocated array holding the bases from the locus position through the
 *         end of the window; never shares storage with the underlying cache
 */
public byte[] getForwardBases() {
    final byte[] bases = getBases();
    final int mid = locus.getStart() - window.getStart();
    // copy the tail of the window directly: the previous implementation
    // (new String(bases).substring(mid).getBytes()) allocated a String and a second
    // array per call and was platform-charset dependent -- this was the performance
    // problem flagged by the old TODO
    final byte[] forwardBases = new byte[bases.length - mid];
    System.arraycopy(bases, mid, forwardBases, 0, forwardBases.length);
    return forwardBases;
}
@Deprecated
public char getBaseAsChar() {
return (char)getBase();

View File

@ -19,13 +19,19 @@ import java.util.Map;
public class QuantizationInfo {
private List<Byte> quantizedQuals;
private List<Long> empiricalQualCounts;
int quantizationLevels;
/**
 * Creates a QuantizationInfo with an explicitly provided number of quantization levels.
 *
 * @param quantizedQuals      mapping from original to quantized quality scores, indexed by original quality
 * @param empiricalQualCounts histogram of observation counts per empirical quality
 * @param quantizationLevels  the number of distinct levels present in quantizedQuals
 */
public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts, int quantizationLevels) {
    this.quantizedQuals = quantizedQuals;
    this.empiricalQualCounts = empiricalQualCounts;
    this.quantizationLevels = quantizationLevels;
}

/**
 * Convenience constructor that derives the number of levels from the quantized map itself.
 * (The stale duplicate two-argument signature line left over from the diff has been removed;
 * it produced a constructor collision.)
 */
public QuantizationInfo(List<Byte> quantizedQuals, List<Long> empiricalQualCounts) {
    this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals));
}
public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int nLevels) {
public QuantizationInfo(Map<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap, int quantizationLevels) {
final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution
for (int i = 0; i < qualHistogram.length; i++)
qualHistogram[i] = 0L;
@ -46,7 +52,9 @@ public class QuantizationInfo {
qualHistogram[empiricalQual] += nObservations; // add the number of observations for every key
}
empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities
quantizeQualityScores(nLevels);
quantizeQualityScores(quantizationLevels);
this.quantizationLevels = quantizationLevels;
}
@ -55,10 +63,20 @@ public class QuantizationInfo {
quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC)
}
/**
 * Disables quantization by installing the identity mapping: every quality score maps to itself.
 */
public void noQuantization() {
    this.quantizationLevels = QualityUtils.MAX_QUAL_SCORE;
    // iterate over the full list so every entry is reset to identity; the previous
    // bound of quantizationLevels left index MAX_QUAL_SCORE untouched when the map
    // holds MAX_QUAL_SCORE+1 entries (as built from the empirical-quality histogram)
    for (int i = 0; i < quantizedQuals.size(); i++)
        quantizedQuals.set(i, (byte) i);
}
/**
 * @return the original-to-quantized quality score mapping, indexed by original quality.
 */
public List<Byte> getQuantizedQuals() {
    return quantizedQuals;
}

/**
 * @return the number of distinct quantization levels currently in effect.
 */
public int getQuantizationLevels() {
    return quantizationLevels;
}
public GATKReportTable generateReportTable() {
GATKReportTable quantizedTable = new GATKReportTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map");
quantizedTable.addPrimaryKey(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
@ -71,4 +89,16 @@ public class QuantizationInfo {
}
return quantizedTable;
}
/**
 * Counts the number of distinct quantization levels in a quantized-quality map by
 * counting runs of equal values (the map is piecewise constant over the quality range).
 *
 * @param quantizedQuals original-to-quantized quality mapping
 * @return the number of distinct levels observed
 */
private static int calculateQuantizationLevels(List<Byte> quantizedQuals) {
    int levels = 0;
    byte previous = -1;
    for (final byte quantized : quantizedQuals) {
        if (quantized == previous)
            continue;                       // still inside the current run
        previous = quantized;               // a new level starts here
        levels++;
    }
    return levels;
}
}

View File

@ -25,8 +25,6 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
* OTHER DEALINGS IN THE SOFTWARE.
*/
import org.broadinstitute.sting.utils.QualityUtils;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
@ -37,10 +35,10 @@ import org.broadinstitute.sting.utils.QualityUtils;
public class RecalDatum extends RecalDatumOptimized {
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
private static final int SMOOTHING_CONSTANT = 1; // used when calculating empirical qualities to avoid division by zero
private static final int SMOOTHING_CONSTANT = 1; // used when calculating empirical qualities to avoid division by zero
//---------------------------------------------------------------------------------------------------------------
//
@ -110,7 +108,11 @@ public class RecalDatum extends RecalDatumOptimized {
}
/**
 * Expected number of erroneous observations implied by the estimated reported quality.
 *
 * @return numObservations scaled by the Phred error probability of estimatedQReported
 */
private double calcExpectedErrors() {
    // the stale duplicate return statement (diff residue using QualityUtils.qualToProb,
    // which is the probability of being CORRECT, not of error) has been removed;
    // the error-probability conversion below is the intended computation
    return (double) this.numObservations * qualToErrorProb(estimatedQReported);
}
/**
 * Converts a (possibly non-integer) Phred-scaled quality score into an error probability.
 *
 * @param qual Phred-scaled quality, may be fractional
 * @return the corresponding probability of error, 10^(-qual/10)
 */
private double qualToErrorProb(final double qual) {
    return Math.pow(10.0, -qual / 10.0);
}
/**

View File

@ -139,7 +139,7 @@ public class RecalibrationReport {
columnNamesOrderedList.add(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.COVARIATE_NAME_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList, false);
}
/**
@ -154,7 +154,7 @@ public class RecalibrationReport {
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.QUALITY_SCORE_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList, false);
}
/**
@ -168,7 +168,7 @@ public class RecalibrationReport {
ArrayList<String> columnNamesOrderedList = new ArrayList<String>(2);
columnNamesOrderedList.add(RecalDataManager.READGROUP_COLUMN_NAME);
columnNamesOrderedList.add(RecalDataManager.EVENT_TYPE_COLUMN_NAME);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList);
return genericRecalTableParsing(keyManager, reportTable, columnNamesOrderedList, true);
}
/**
@ -179,7 +179,7 @@ public class RecalibrationReport {
* @param columnNamesOrderedList a list of columns to read from the report table and build as key for this particular table
* @return a lookup table indexed by bitsets containing the empirical quality and estimated quality reported for every key.
*/
private Map<BitSet, RecalDatum> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList) {
private Map<BitSet, RecalDatum> genericRecalTableParsing(BQSRKeyManager keyManager, GATKReportTable reportTable, ArrayList<String> columnNamesOrderedList, boolean hasEstimatedQReportedColumn) {
Map<BitSet, RecalDatum> result = new HashMap<BitSet, RecalDatum>(reportTable.getNumRows()*2);
for (Object primaryKey : reportTable.getPrimaryKeys()) {
@ -192,10 +192,13 @@ public class RecalibrationReport {
long nObservations = (Long) reportTable.get(primaryKey, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME);
long nErrors = (Long) reportTable.get(primaryKey, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME);
double estimatedQReported = (Double) reportTable.get(primaryKey, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME);
double empiricalQuality = (Double) reportTable.get(primaryKey, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME);
RecalDatum recalDatum = new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality);
double estimatedQReported = hasEstimatedQReportedColumn ? // the estimatedQreported column only exists in the ReadGroup table
(Double) reportTable.get(primaryKey, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table
Byte.parseByte((String) reportTable.get(primaryKey, RecalDataManager.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table
RecalDatum recalDatum = new RecalDatum(nObservations, nErrors, estimatedQReported, empiricalQuality);
result.put(bitKey, recalDatum);
}
return result;

View File

@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.utils.MathUtils;
import java.util.ArrayList;
import java.util.Arrays;
/**
@ -42,12 +41,13 @@ public class AlleleFrequencyCalculationResult {
// These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles
private double log10MLE;
private double log10MAP;
final private int[] alleleCountsOfMLE;
final private int[] alleleCountsOfMAP;
private final int[] alleleCountsOfMLE;
private final int[] alleleCountsOfMAP;
// The posteriors seen, not including that of AF=0
// TODO -- better implementation needed here (see below)
private ArrayList<Double> log10PosteriorMatrixValues = new ArrayList<Double>(100000);
private static final int POSTERIORS_CACHE_SIZE = 5000;
private final double[] log10PosteriorMatrixValues = new double[POSTERIORS_CACHE_SIZE];
private int currentPosteriorsCacheIndex = 0;
private Double log10PosteriorMatrixSum = null;
// These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
@ -69,14 +69,9 @@ public class AlleleFrequencyCalculationResult {
return log10MAP;
}
public double getLog10PosteriorMatrixSum() {
public double getLog10PosteriorsMatrixSumWithoutAFzero() {
if ( log10PosteriorMatrixSum == null ) {
// TODO -- we absolutely need a better implementation here as we don't want to store all values from the matrix in memory;
// TODO -- will discuss with the team what the best option is
final double[] tmp = new double[log10PosteriorMatrixValues.size()];
for ( int i = 0; i < tmp.length; i++ )
tmp[i] = log10PosteriorMatrixValues.get(i);
log10PosteriorMatrixSum = MathUtils.log10sumLog10(tmp);
log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
}
return log10PosteriorMatrixSum;
}
@ -103,7 +98,7 @@ public class AlleleFrequencyCalculationResult {
alleleCountsOfMLE[i] = 0;
alleleCountsOfMAP[i] = 0;
}
log10PosteriorMatrixValues.clear();
currentPosteriorsCacheIndex = 0;
log10PosteriorMatrixSum = null;
}
@ -116,7 +111,8 @@ public class AlleleFrequencyCalculationResult {
}
public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) {
log10PosteriorMatrixValues.add(log10LofK);
addToPosteriorsCache(log10LofK);
if ( log10LofK > log10MAP ) {
log10MAP = log10LofK;
for ( int i = 0; i < alleleCountsForK.length; i++ )
@ -124,6 +120,18 @@ public class AlleleFrequencyCalculationResult {
}
}
private void addToPosteriorsCache(final double log10LofK) {
// add to the cache
log10PosteriorMatrixValues[currentPosteriorsCacheIndex++] = log10LofK;
// if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell
if ( currentPosteriorsCacheIndex == POSTERIORS_CACHE_SIZE ) {
final double temporarySum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex);
log10PosteriorMatrixValues[0] = temporarySum;
currentPosteriorsCacheIndex = 1;
}
}
public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
if ( log10LikelihoodOfAFzero > log10MLE ) {

View File

@ -72,6 +72,8 @@ import static java.lang.Math.pow;
*/
public class DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering implements Cloneable {
public final static double DEFAULT_PCR_ERROR_RATE = 1e-4;
protected final static int FIXED_PLOIDY = 2;
protected final static int MAX_PLOIDY = FIXED_PLOIDY + 1;
protected final static double ploidyAdjustment = log10(FIXED_PLOIDY);

View File

@ -45,7 +45,7 @@ public class UnifiedArgumentCollection {
* het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2
*/
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY;
public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY;
/**
* The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily
@ -53,7 +53,7 @@ public class UnifiedArgumentCollection {
* effectively acts as a cap on the base qualities.
*/
@Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false)
public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE;
public Double PCR_error = DiploidSNPGenotypeLikelihoodsWithCorrectAlleleOrdering.DEFAULT_PCR_ERROR_RATE;
/**
* Specifies how to determine the alternate allele to use for genotyping

View File

@ -51,6 +51,9 @@ import java.util.*;
public class UnifiedGenotyperEngine {
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3;
public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4;
public enum OUTPUT_MODE {
/** produces calls only at variant sites */
EMIT_VARIANTS_ONLY,
@ -324,7 +327,7 @@ public class UnifiedGenotyperEngine {
} else {
phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF);
if ( Double.isInfinite(phredScaledConfidence) ) {
final double sum = AFresult.getLog10PosteriorMatrixSum();
final double sum = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum);
}
}
@ -367,7 +370,7 @@ public class UnifiedGenotyperEngine {
// the overall lod
//double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0];
double overallLog10PofF = AFresult.getLog10PosteriorMatrixSum();
double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
//if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF);
List<Allele> alternateAllelesToUse = builder.make().getAlternateAlleles();
@ -378,7 +381,7 @@ public class UnifiedGenotyperEngine {
afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult);
//double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double forwardLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
double forwardLog10PofF = AFresult.getLog10PosteriorMatrixSum();
double forwardLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
//if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF);
// the reverse lod
@ -387,7 +390,7 @@ public class UnifiedGenotyperEngine {
afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult);
//normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true);
double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero();
double reverseLog10PofF = AFresult.getLog10PosteriorMatrixSum();
double reverseLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
//if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF);
double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF;
@ -422,7 +425,7 @@ public class UnifiedGenotyperEngine {
public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) {
normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero();
normalizedPosteriors[1] = AFresult.getLog10PosteriorMatrixSum();
normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero();
return MathUtils.normalizeFromLog10(normalizedPosteriors);
}
@ -620,8 +623,6 @@ public class UnifiedGenotyperEngine {
}
public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3;
public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4;
protected double getTheta( final GenotypeLikelihoodsCalculationModel.Model model ) {
if( model.name().contains("SNP") )
return HUMAN_SNP_HETEROZYGOSITY;

View File

@ -116,6 +116,15 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
/**
* Some analyses want to count overlap not with dbSNP (which is in general very open) but
* actually want to itemize their overlap specifically with a set of gold standard sites
* such as HapMap, OMNI, or the gold standard indels. This argument provides a mechanism
* for communicating which file to use
*/
@Input(fullName="goldStandard", shortName = "gold", doc="Evaluations that count calls at sites of true variation (e.g., indel calls) will use this argument as their gold standard for comparison", required=false)
public RodBinding<VariantContext> goldStandard = null;
// Help arguments
@Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", required=false)
protected Boolean LIST = false;

View File

@ -56,6 +56,12 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
@DataPoint(description = "Number of singleton Indels", format = "%d")
public int n_singleton_indels = 0;
@DataPoint(description = "Number of Indels overlapping gold standard sites", format = "%d")
public int n_indels_matching_gold_standard = 0;
@DataPoint(description = "Percent of indels overlapping gold standard sites")
public String gold_standard_matching_rate;
// counts 1 for each site where the number of alleles > 2
public int nMultiIndelSites = 0;
@ -71,18 +77,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
@DataPoint(description = "Indel novelty rate")
public String indel_novelty_rate;
@DataPoint(description = "1 to 2 bp indel ratio")
public String ratio_of_1_to_2_bp_indels;
@DataPoint(description = "1 to 3 bp indel ratio")
public String ratio_of_1_to_3_bp_indels;
@DataPoint(description = "2 to 3 bp indel ratio")
public String ratio_of_2_to_3_bp_indels;
@DataPoint(description = "1 and 2 to 3 bp indel ratio")
public String ratio_of_1_and_2_to_3_bp_indels;
@DataPoint(description = "Frameshift percent")
public String frameshift_rate_for_coding_indels;
@ -92,9 +86,6 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
@DataPoint(description = "Insertion to deletion ratio")
public String insertion_to_deletion_ratio;
@DataPoint(description = "Insertion to deletion ratio for 1 bp events")
public String insertion_to_deletion_ratio_for_1bp_indels;
//
// Frameshifts
//
@ -116,9 +107,25 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
int nSNPHets = 0, nSNPHoms = 0, nIndelHets = 0, nIndelHoms = 0;
int nKnownIndels = 0, nInsertions = 0;
int n1bpInsertions = 0, n1bpDeletions = 0;
int[] countByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used
int[] insertionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used
int[] deletionCountByLength = new int[]{0, 0, 0, 0}; // note that the first element isn't used
// - Since 1 & 2 bp insertions and 1 & 2 bp deletions are equally likely to cause a
// downstream frameshift, if we make the simplifying assumptions that 3 bp ins
// and 3bp del (adding/subtracting 1 AA in general) are roughly comparably
// selected against, we should see a consistent 1+2 : 3 bp ratio for insertions
// as for deletions, and certainly would expect consistency between in/dels that
// multiple methods find and in/dels that are unique to one method (since deletions
// are more common and the artifacts differ, it is probably worth looking at the totals,
// overlaps and ratios for insertions and deletions separately in the methods
// comparison and in this case don't even need to make the simplifying in = del functional assumption
@DataPoint(description = "ratio of 1 and 2 bp insertions to 3 bp insertions")
public String ratio_of_1_and_2_to_3_bp_insertions;
@DataPoint(description = "ratio of 1 and 2 bp deletions to 3 bp deletions")
public String ratio_of_1_and_2_to_3_bp_deletions;
public final static int LARGE_INDEL_SIZE_THRESHOLD = 10;
@ -150,11 +157,11 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
}
break;
case INDEL:
final VariantContext gold = getWalker().goldStandard == null ? null : tracker.getFirstValue(getWalker().goldStandard);
if ( eval.isComplexIndel() ) break; // don't count complex substitutions
nIndelSites++;
if ( ! eval.isBiallelic() ) nMultiIndelSites++;
if ( variantWasSingleton(eval) ) n_singleton_indels++;
// collect information about het / hom ratio
for ( final Genotype g : eval.getGenotypes() ) {
@ -164,15 +171,14 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
for ( Allele alt : eval.getAlternateAlleles() ) {
n_indels++; // +1 for each alt allele
if ( variantWasSingleton(eval) ) n_singleton_indels++;
if ( comp != null ) nKnownIndels++; // TODO -- make this test allele specific?
if ( gold != null ) n_indels_matching_gold_standard++;
// ins : del ratios
final int alleleSize = alt.length() - eval.getReference().length();
if ( alleleSize == 0 ) throw new ReviewedStingException("Allele size not expected to be zero for indel: alt = " + alt + " ref = " + eval.getReference());
if ( alleleSize > 0 ) nInsertions++;
if ( alleleSize == 1 ) n1bpInsertions++;
if ( alleleSize == -1 ) n1bpDeletions++;
// requires snpEFF annotations
if ( eval.getAttributeAsString("SNPEFF_GENE_BIOTYPE", "missing").equals("protein_coding") ) {
@ -193,6 +199,7 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
n_large_deletions++;
// update the baby histogram
final int[] countByLength = alleleSize < 0 ? deletionCountByLength : insertionCountByLength;
final int absSize = Math.abs(alleleSize);
if ( absSize < countByLength.length ) countByLength[absSize]++;
@ -210,18 +217,18 @@ public class IndelSummary extends VariantEvaluator implements StandardEval {
percent_of_sites_with_more_than_2_alleles = Utils.formattedRatio(nMultiIndelSites, nIndelSites);
SNP_to_indel_ratio = Utils.formattedRatio(n_SNPs, n_indels);
SNP_to_indel_ratio_for_singletons = Utils.formattedRatio(n_singleton_SNPs, n_singleton_indels);
gold_standard_matching_rate = Utils.formattedNoveltyRate(n_indels_matching_gold_standard, n_indels);
indel_novelty_rate = Utils.formattedNoveltyRate(nKnownIndels, n_indels);
ratio_of_1_to_2_bp_indels = Utils.formattedRatio(countByLength[1], countByLength[2]);
ratio_of_1_to_3_bp_indels = Utils.formattedRatio(countByLength[1], countByLength[3]);
ratio_of_2_to_3_bp_indels = Utils.formattedRatio(countByLength[2], countByLength[3]);
ratio_of_1_and_2_to_3_bp_indels = Utils.formattedRatio(countByLength[1] + countByLength[2], countByLength[3]);
frameshift_rate_for_coding_indels = Utils.formattedPercent(n_coding_indels_frameshifting, n_coding_indels_in_frame + n_coding_indels_frameshifting);
ratio_of_1_and_2_to_3_bp_deletions = Utils.formattedRatio(deletionCountByLength[1] + deletionCountByLength[2], deletionCountByLength[3]);
ratio_of_1_and_2_to_3_bp_insertions = Utils.formattedRatio(insertionCountByLength[1] + insertionCountByLength[2], insertionCountByLength[3]);
SNP_het_to_hom_ratio = Utils.formattedRatio(nSNPHets, nSNPHoms);
indel_het_to_hom_ratio = Utils.formattedRatio(nIndelHets, nIndelHoms);
insertion_to_deletion_ratio = Utils.formattedRatio(nInsertions, n_indels - nInsertions);
insertion_to_deletion_ratio_for_1bp_indels = Utils.formattedRatio(n1bpInsertions, n1bpDeletions);
insertion_to_deletion_ratio_for_large_indels = Utils.formattedRatio(n_large_insertions, n_large_deletions);
}

View File

@ -0,0 +1,59 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
 * Stratifies the eval RODs into sites where the indel is 1 bp in length and those where the event is 2+ bp.
 * All non-indel events go into all bins, so that SNP counts can be used as contrasts in eval modules.
 */
public class OneBPIndel extends VariantStratifier {
    private final static List<Object> ALL = Arrays.asList((Object)"all", (Object)"one.bp", (Object)"two.plus.bp");
    private final static List<Object> ONE_BP = Arrays.asList((Object)"all", (Object)"one.bp");
    private final static List<Object> TWO_PLUS_BP = Arrays.asList((Object)"all", (Object)"two.plus.bp");

    @Override
    public void initialize() {
        states.addAll(ALL);
    }

    @Override
    public List<Object> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
        // non-indels (and missing evals) contribute to every bin
        if (eval == null || !eval.isIndel())
            return ALL;

        // a single allele longer than 1 bp puts the whole site into the 2+ bin
        for (final int indelLength : eval.getIndelLengths()) {
            if (indelLength > 1)
                return TWO_PLUS_BP;
        }
        return ONE_BP; // every allele is exactly 1 bp long
    }
}

View File

@ -0,0 +1,67 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.Arrays;
import java.util.List;
/**
* Stratifies the eval RODs into sites that are tandem repeats
*/
/**
 * Stratifies the eval RODs into sites that are and are not tandem repeats.
 *
 * Indel eval calls are binned as "is.repeat" or "not.repeat" (plus "all");
 * non-indel or absent eval calls contribute to every bin so they can serve
 * as contrasts in eval modules.
 */
public class TandemRepeat extends VariantStratifier {
    private final static List<Object> ALL = Arrays.asList((Object)"all", (Object)"is.repeat", (Object)"not.repeat");
    private final static List<Object> REPEAT = Arrays.asList((Object)"all", (Object)"is.repeat");
    private final static List<Object> NOT_REPEAT = Arrays.asList((Object)"all", (Object)"not.repeat");

    @Override
    public void initialize() {
        states.addAll(ALL);
    }

    @Override
    public List<Object> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
        if ( eval == null || ! eval.isIndel() )
            return ALL; // only indels can be tandem repeats
        else if ( VariantContextUtils.isTandemRepeat(eval, ref.getForwardBases()) )
            return REPEAT;
        else
            return NOT_REPEAT;
    }
}

View File

@ -27,12 +27,11 @@ package org.broadinstitute.sting.utils;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
@ -41,6 +40,7 @@ public class Haplotype {
protected final double[] quals;
private GenomeLoc genomeLocation = null;
private HashMap<String, double[]> readLikelihoodsPerSample = null;
private HashMap<Integer, VariantContext> eventMap = null;
private boolean isRef = false;
private Cigar cigar;
private int alignmentStartHapwrtRef;
@ -97,6 +97,14 @@ public class Haplotype {
return readLikelihoodsPerSample.keySet();
}
/**
 * @return the map from haplotype offset to the variant event at that position,
 *         or null if no event map has been set via setEventMap()
 */
public HashMap<Integer, VariantContext> getEventMap() {
    return eventMap;
}
/**
 * Sets the event map for this haplotype.  The map is stored by reference, not copied.
 *
 * @param eventMap map from haplotype offset to the variant event at that position
 */
public void setEventMap( final HashMap<Integer, VariantContext> eventMap ) {
    this.eventMap = eventMap;
}
/**
 * @return true if this haplotype represents the reference sequence
 */
public boolean isReference() {
    return isRef;
}

View File

@ -224,10 +224,6 @@ public class IndelUtils {
return inds;
}
/**
 * @return the column key names for all indel classifications
 *
 * NOTE(review): this returns the internal COLUMN_KEYS array directly, so callers
 * could mutate it — consider returning a defensive copy.
 */
public static String[] getIndelClassificationNames() {
    return COLUMN_KEYS;
}
public static String getIndelClassificationName(int k) {
if (k >=0 && k < COLUMN_KEYS.length)
return COLUMN_KEYS[k];
@ -235,35 +231,6 @@ public class IndelUtils {
throw new ReviewedStingException("Invalid index when trying to get indel classification name");
}
/**
 * Determines whether this event is classified as an A or T repeat expansion.
 *
 * @param vc  the variant context to classify
 * @param ref the reference context at the event
 * @return true iff any classification index is an A or T repeat expansion
 */
public static boolean isATExpansion(VariantContext vc, ReferenceContext ref) {
    for ( final int index : findEventClassificationIndex(vc, ref) ) {
        if ( index == IND_FOR_REPEAT_EXPANSION_A || index == IND_FOR_REPEAT_EXPANSION_T )
            return true;
    }
    return false;
}
/**
 * Determines whether this event is classified as a C or G repeat expansion.
 *
 * @param vc  the variant context to classify
 * @param ref the reference context at the event
 * @return true iff any classification index is a C or G repeat expansion
 */
public static boolean isCGExpansion(VariantContext vc, ReferenceContext ref) {
    for ( final int index : findEventClassificationIndex(vc, ref) ) {
        if ( index == IND_FOR_REPEAT_EXPANSION_C || index == IND_FOR_REPEAT_EXPANSION_G )
            return true;
    }
    return false;
}
/**
 * @return true when the current locus lies strictly inside the event,
 *         i.e. the variant's start differs from the reference locus start
 */
public static boolean isInsideExtendedIndel(VariantContext vc, ReferenceContext ref) {
    return (vc.getStart() != ref.getLocus().getStart());
}

View File

@ -237,7 +237,7 @@ public class MathUtils {
public static double log10sumLog10(double[] log10p, int start, int finish) {
double sum = 0.0;
double maxValue = Utils.findMaxEntry(log10p);
double maxValue = arrayMax(log10p, finish);
if(maxValue == Double.NEGATIVE_INFINITY)
return sum;
@ -554,7 +554,7 @@ public class MathUtils {
// for precision purposes, we need to add (or really subtract, since they're
// all negative) the largest value; also, we need to convert to normal-space.
double maxValue = Utils.findMaxEntry(array);
double maxValue = arrayMax(array);
// we may decide to just normalize in log space without converting to linear space
if (keepInLogSpace) {
@ -627,10 +627,14 @@ public class MathUtils {
return maxI;
}
public static double arrayMax(double[] array) {
public static double arrayMax(final double[] array) {
return array[maxElementIndex(array)];
}
public static double arrayMax(final double[] array, final int endIndex) {
return array[maxElementIndex(array, endIndex)];
}
public static double arrayMin(double[] array) {
return array[minElementIndex(array)];
}

View File

@ -290,32 +290,6 @@ public class Utils {
return m;
}
// returns the maximum value in the array (0.0 for an empty array)
public static double findMaxEntry(double[] array) {
    return findIndexAndMaxEntry(array).first;
}
// returns the index of the maximum value in the array (-1 for an empty array)
public static int findIndexOfMaxEntry(double[] array) {
    return findIndexAndMaxEntry(array).second;
}
// returns the maximum value and the index of its first occurrence;
// (0.0, -1) is returned for an empty array
private static Pair<Double, Integer> findIndexAndMaxEntry(double[] array) {
    if ( array.length == 0 )
        return new Pair<Double, Integer>(0.0, -1);

    // track only the winning index; the value is read out at the end
    int bestIndex = 0;
    for ( int i = 1; i < array.length; i++ ) {
        if ( array[i] > array[bestIndex] )
            bestIndex = i;
    }
    return new Pair<Double, Integer>(array[bestIndex], bestIndex);
}
/**
* Splits expressions in command args by spaces and returns the array of expressions.
* Expressions may use single or double quotes to group any individual expression, but not both.

View File

@ -44,23 +44,24 @@ import java.util.*;
public class BaseRecalibration {
private QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done)
private LinkedHashMap<BQSRKeyManager, Map<BitSet, RecalDatum>> keysAndTablesMap; // quick access reference to the read group table and its key manager
private ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // list of all covariates to be used in this calculation
private static String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
private static String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here";
/**
* Constructor using a GATK Report file
*
* @param RECAL_FILE a GATK Report file containing the recalibration information
*/
public BaseRecalibration(final File RECAL_FILE) {
public BaseRecalibration(final File RECAL_FILE, int quantizationLevels) {
RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE);
quantizationInfo = recalibrationReport.getQuantizationInfo();
keysAndTablesMap = recalibrationReport.getKeysAndTablesMap();
requestedCovariates = recalibrationReport.getRequestedCovariates();
quantizationInfo = recalibrationReport.getQuantizationInfo();
if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores
quantizationInfo.noQuantization();
else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wants to use what's in the report.
quantizationInfo.quantizeQualityScores(quantizationLevels);
}
/**
@ -71,17 +72,17 @@ public class BaseRecalibration {
* @param read the read to recalibrate
*/
public void recalibrateRead(final GATKSAMRecord read) {
final ReadCovariates readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); // compute all covariates for the read
for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings
final ReadCovariates readCovariates = RecalDataManager.computeCovariates(read, requestedCovariates); // compute all covariates for the read
for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings
final byte[] originalQuals = read.getBaseQualities(errorModel);
final byte[] recalQuals = originalQuals.clone();
for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read
for (int offset = 0; offset < read.getReadLength(); offset++) { // recalibrate all bases in the read
byte qualityScore = originalQuals[offset];
if (qualityScore > QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality)
final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model
qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base
if (qualityScore > QualityUtils.MIN_USABLE_Q_SCORE) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality)
final BitSet[] keySet = readCovariates.getKeySet(offset, errorModel); // get the keyset for this base using the error model
qualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base
}
recalQuals[offset] = qualityScore;
}
@ -109,6 +110,9 @@ public class BaseRecalibration {
* @return A recalibrated quality score as a byte
*/
private byte performSequentialQualityCalculation(BitSet[] key, EventType errorModel) {
final String UNRECOGNIZED_REPORT_TABLE_EXCEPTION = "Unrecognized table. Did you add an extra required covariate? This is a hard check that needs propagate through the code";
final String TOO_MANY_KEYS_EXCEPTION = "There should only be one key for the RG collapsed table, something went wrong here";
final byte qualFromRead = (byte) BitSetUtils.shortFrom(key[1]);
double globalDeltaQ = 0.0;

View File

@ -1202,4 +1202,80 @@ public class VariantContextUtils {
final double qual = numNewAltAlleles == 0 ? Genotype.NO_LOG10_PERROR : GenotypeLikelihoods.getQualFromLikelihoods(PLindex, newLikelihoods);
return new Genotype(originalGT.getSampleName(), myAlleles, qual, null, attrs, false);
}
/**
 * Returns true iff VC is a non-complex indel where every allele represents an expansion or
 * contraction of a series of identical bases in the reference.
 *
 * For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT
 *
 * If VC = -/CT, then this function returns true because the CT insertion matches exactly the
 * upcoming reference.
 * If VC = -/CTA then this function returns false because the CTA isn't a perfect match
 *
 * Now consider deletions:
 *
 * If VC = CT/- then again the same logic applies and this returns true
 * The case of CTA/- makes no sense because it doesn't actually match the reference bases.
 *
 * The logic of this function is pretty simple.  Take all of the non-null alleles in VC.  For
 * each insertion allele of n bases, check if that allele matches the next n reference bases.
 * For each deletion allele of n bases, check if this matches the reference bases at n to 2n,
 * as it must necessarily match the first n bases.  If this test returns true for all
 * alleles you are a tandem repeat, otherwise you are not.
 *
 * @param vc                          the variant context to test
 * @param refBasesStartingAtVCWithPad note this is assumed to include the PADDED reference base
 * @return true iff every alternate allele of vc is a repeat expansion/contraction of the reference
 */
@Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"})
public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) {
    if ( ! vc.isIndel() ) // only indels can be tandem repeats
        return false;

    // strip the single padding base; done after the cheap indel check above so
    // non-indel calls don't pay for the String copy
    final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1);

    final Allele ref = vc.getReference();
    for ( final Allele allele : vc.getAlternateAlleles() ) {
        if ( ! isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad) )
            return false;
    }

    // we've passed all of the tests, so we are a repeat
    return true;
}
/**
 * Helper function for isTandemRepeat that checks that the allele matches somewhere on the reference.
 *
 * @param ref the reference allele
 * @param alt one alternate allele
 * @param refBasesStartingAtVCWithoutPad reference bases following the event, padding base removed
 * @return true when the inserted/deleted bases tandem-match the upcoming reference
 */
protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) {
    // a simple expansion/contraction requires one allele to be a prefix of the other
    if ( ! Allele.oneIsPrefixOfOther(ref, alt) )
        return false;

    final boolean isDeletion = ref.length() > alt.length();
    return isDeletion
            ? basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2)  // deletions must match the reference twice
            : basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1); // insertions must match it once
}
/**
 * Checks whether the event bases (the suffix of l beyond s) occur as a tandem repeat
 * at the start of ref, at least minNumberOfMatches times in a row.
 *
 * @param l                  the longer allele's bases
 * @param s                  the shorter allele's bases (a prefix of l)
 * @param ref                the reference bases following the event
 * @param minNumberOfMatches how many consecutive copies of the repeat unit must be present
 * @return true when ref begins with at least minNumberOfMatches copies of the repeat unit
 */
protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) {
    final String repeatUnit = l.substring(s.length()); // the event bases: skip the shared prefix
    final int unitLength = repeatUnit.length();
    final int requiredLength = minNumberOfMatches * unitLength;

    if ( ref.length() < requiredLength )
        return false; // not enough reference bases to test

    // every unit-sized window up to requiredLength must equal the repeat unit
    for ( int offset = 0; offset < requiredLength; offset += unitLength ) {
        if ( ! ref.startsWith(repeatUnit, offset) )
            return false; // repeat didn't match, fail
    }
    return true; // we passed all tests, we matched
}
}

View File

@ -302,7 +302,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
" --eval " + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf" +
" --comp:comp_genotypes,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.head.vcf";
WalkerTestSpec spec = new WalkerTestSpec(withSelect(tests, "DP < 50", "DP50") + " " + extraArgs + " -ST CpG -o %s",
1, Arrays.asList("c8a782f51e094dc7be06dbfb795feab2"));
1, Arrays.asList("4c00cfa0fd343fef62d19af0edeb4f65"));
executeTestParallel("testSelect1", spec);
}
@ -330,7 +330,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
@Test
public void testCompVsEvalAC() {
String extraArgs = "-T VariantEval -R "+b36KGReference+" -o %s -ST CpG -EV GenotypeConcordance --eval:evalYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.ug.very.few.lines.vcf --comp:compYRI,VCF3 " + validationDataLocation + "yri.trio.gatk.fake.genotypes.ac.test.vcf";
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("5c409a2ab4517f862c6678902c0fd7a1"));
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("4df6654860ad63b7e24e6bc5fbbbcb00"));
executeTestParallel("testCompVsEvalAC",spec);
}
@ -360,7 +360,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
" --dbsnp " + b37dbSNP132 +
" --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" +
" -noST -ST Novelty -o %s";
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("a27c700eabe6b7b3877c8fd4eabb3975"));
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("3b85cd0fa37539ff51d34e026f26fef2"));
executeTestParallel("testEvalTrackWithoutGenotypes",spec);
}
@ -372,7 +372,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
" --eval:evalBI " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bi.sites.vcf" +
" --eval:evalBC " + validationDataLocation + "VariantEval/ALL.20100201.chr20.bc.sites.vcf" +
" -noST -ST Novelty -o %s";
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("3272a2db627d4f42bc512df49a8ea64b"));
WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("bed8751c773b9568218f78c90f13348a"));
executeTestParallel("testMultipleEvalTracksWithoutGenotypes",spec);
}
@ -488,11 +488,32 @@ public class VariantEvalIntegrationTest extends WalkerTest {
"-o %s"
),
1,
Arrays.asList("41a37636868a838a632559949c5216cf")
Arrays.asList("9726c0c8f19d271cf680f5f16f0926b3")
);
executeTest("testModernVCFWithLargeIndels", spec);
}
// Integration test for the indel-specific stratifiers (Sample, OneBPIndel, TandemRepeat)
// and evaluators (IndelSummary, IndelLengthHistogram) over the NA12878 HiSeq indel
// callset on chr20; the MD5 pins the expected report output.
@Test
public void testStandardIndelEval() {
    WalkerTestSpec spec = new WalkerTestSpec(
            buildCommandLine(
                    "-T VariantEval",
                    "-R " + b37KGReference,
                    "-eval " + validationDataLocation + "/NA12878.HiSeq.WGS.b37_decoy.indel.recalibrated.vcf",
                    "-L 20",
                    "-noST -ST Sample -ST OneBPIndel -ST TandemRepeat",
                    "-noEV -EV IndelSummary -EV IndelLengthHistogram",
                    "-gold " + validationDataLocation + "/Mills_and_1000G_gold_standard.indels.b37.sites.vcf",
                    "-D " + b37dbSNP132,
                    "-o %s"
            ),
            1,
            Arrays.asList("c89705147ef4233d5de3a539469bd1d1")
    );
    executeTest("testStandardIndelEval", spec);
}
@Test()
public void testIncompatibleEvalAndStrat() {
WalkerTestSpec spec = new WalkerTestSpec(

View File

@ -284,6 +284,18 @@ public class MathUtilsUnitTest extends BaseTest {
Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[] {-1.0, -3.0, -1.0, -2.0}), new double[] {0.1 * 1.0 / 0.211, 0.001 * 1.0 / 0.211, 0.1 * 1.0 / 0.211, 0.01 * 1.0 / 0.211}));
}
// Exercises all three log10sumLog10 overloads: summing three log10(1)=0.0 terms
// gives log10(3), a two-term window gives log10(2), and a one-term window gives 0.0.
@Test
public void testLog10sumLog10() {
    final double log3 = 0.477121254719662; // log10(3)
    Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}), log3), 0);
    Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0), log3), 0);
    Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}, 0, 3), log3), 0);
    final double log2 = 0.301029995663981; // log10(2)
    Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 2), log2), 0);
    Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 1), 0.0), 0);
}
@Test
public void testDotProduct() {
Assert.assertEquals(MathUtils.dotProduct(new Double[]{-5.0,-3.0,2.0}, new Double[]{6.0,7.0,8.0}),-35.0,1e-3);

View File

@ -20,7 +20,7 @@ public class BaseRecalibrationUnitTest {
@Test(enabled=false)
public void testReadingReport() {
File csv = new File("public/testdata/exampleGATKREPORT.grp");
BaseRecalibration baseRecalibration = new BaseRecalibration(csv);
BaseRecalibration baseRecalibration = new BaseRecalibration(csv, -1);
GATKSAMRecord read = ReadUtils.createRandomRead(1000);
read.setReadGroup(new GATKSAMReadGroupRecord(new SAMReadGroupRecord("exampleBAM.bam.bam"), NGSPlatform.ILLUMINA));
baseRecalibration.recalibrateRead(read);

View File

@ -589,4 +589,76 @@ public class VariantContextUtilsUnitTest extends BaseTest {
return priority;
}
// --------------------------------------------------------------------------------
//
// Test repeats
//
// --------------------------------------------------------------------------------
// Test fixture: a VariantContext built from allele strings plus the expected
// outcome of VariantContextUtils.isTandemRepeat against the given reference bases.
private class RepeatDetectorTest extends TestDataProvider {
    String ref;           // reference bases, with a leading dummy "N" padding base
    boolean isTrueRepeat; // expected isTandemRepeat() result
    VariantContext vc;    // the event under test

    /**
     * @param isTrueRepeat    expected result of the repeat detector
     * @param ref             reference bases following the event (padding base is prepended here)
     * @param refAlleleString the reference allele
     * @param altAlleleStrings one or more alternate alleles
     */
    private RepeatDetectorTest(boolean isTrueRepeat, String ref, String refAlleleString, String ... altAlleleStrings) {
        super(RepeatDetectorTest.class);
        this.ref = "N" + ref; // add a dummy base for the event here
        this.isTrueRepeat = isTrueRepeat;
        List<Allele> alleles = new LinkedList<Allele>();
        final Allele refAllele = Allele.create(refAlleleString, true);
        alleles.add(refAllele);
        for ( final String altString: altAlleleStrings) {
            final Allele alt = Allele.create(altString, false);
            alleles.add(alt);
        }
        VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, 1 + refAllele.length(), alleles);
        this.vc = builder.make();
    }

    public String toString() {
        return String.format("%s refBases=%s trueRepeat=%b vc=%s", super.toString(), ref, isTrueRepeat, vc);
    }
}
// Builds the table of repeat-detector cases: simple 1 bp events, multi-base repeat
// units, events longer than the available reference, and multi-allelic records
// (which are repeats only when every alt allele matches).
@DataProvider(name = "RepeatDetectorTest")
public Object[][] makeRepeatDetectorTest() {
    new RepeatDetectorTest(true, "AAC", "-", "A");
    new RepeatDetectorTest(true, "AAC", "A", "-");
    new RepeatDetectorTest(false, "AAC", "AA", "-");
    new RepeatDetectorTest(false, "AAC", "-", "C");
    new RepeatDetectorTest(false, "AAC", "A", "C");

    // running out of ref bases => false
    new RepeatDetectorTest(false, "AAC", "-", "CAGTA");

    // complex repeats
    new RepeatDetectorTest(true, "ATATATC", "-", "AT");
    new RepeatDetectorTest(true, "ATATATC", "-", "ATA");
    new RepeatDetectorTest(true, "ATATATC", "-", "ATAT");
    new RepeatDetectorTest(true, "ATATATC", "AT", "-");
    new RepeatDetectorTest(false, "ATATATC", "ATA", "-");
    new RepeatDetectorTest(false, "ATATATC", "ATAT", "-");

    // multi-allelic
    new RepeatDetectorTest(true, "ATATATC", "-", "AT", "ATAT");
    new RepeatDetectorTest(true, "ATATATC", "-", "AT", "ATA");
    new RepeatDetectorTest(true, "ATATATC", "AT", "-", "ATAT");
    new RepeatDetectorTest(true, "ATATATC", "AT", "-", "ATA"); // two As
    new RepeatDetectorTest(false, "ATATATC", "AT", "-", "ATC"); // false
    new RepeatDetectorTest(false, "ATATATC", "AT", "-", "CC"); // false
    new RepeatDetectorTest(false, "ATATATC", "AT", "ATAT", "CC"); // false

    return RepeatDetectorTest.getTests(RepeatDetectorTest.class);
}
// Runs isTandemRepeat over every fixture and checks it against the expected flag.
@Test(dataProvider = "RepeatDetectorTest")
public void testRepeatDetectorTest(RepeatDetectorTest cfg) {
    // test alleles are equal
    Assert.assertEquals(VariantContextUtils.isTandemRepeat(cfg.vc, cfg.ref.getBytes()), cfg.isTrueRepeat);
}
}

File diff suppressed because it is too large Load Diff