Merge pull request #377 from broadinstitute/eb_experiment_with_pcr_error_model
Eb experiment with pcr error model
This commit is contained in:
commit
a21d5252c8
|
|
@ -472,6 +472,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
@Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false)
|
||||
protected int PADDING_AROUND_SNPS_FOR_CALLING = 20;
|
||||
|
||||
/**
|
||||
* Which PCR indel error model should we use when calculating likelihoods? If NONE is selected, then the default base
|
||||
* insertion/deletion qualities will be used (or taken from the read if generated through the BaseRecalibrator).
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "pcr_indel_model", shortName = "pcrModel", doc = "The PCR indel model to use", required = false)
|
||||
public LikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = LikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE;
|
||||
|
||||
// -----------------------------------------------------------------------------------------------
|
||||
// done with Haplotype caller parameters
|
||||
// -----------------------------------------------------------------------------------------------
|
||||
|
|
@ -624,7 +632,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
|
|||
}
|
||||
|
||||
// create our likelihood calculation engine
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga );
|
||||
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel );
|
||||
|
||||
final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes();
|
||||
|
||||
|
|
|
|||
|
|
@ -63,6 +63,8 @@ import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM;
|
|||
import org.broadinstitute.sting.utils.pairhmm.CnyPairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.BatchPairHMM;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.variant.variantcontext.Allele;
|
||||
|
|
@ -76,6 +78,8 @@ import java.util.*;
|
|||
public class LikelihoodCalculationEngine {
|
||||
private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class);
|
||||
|
||||
private static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual
|
||||
|
||||
private final byte constantGCP;
|
||||
private final double log10globalReadMismappingRate;
|
||||
private final boolean DEBUG;
|
||||
|
|
@ -104,6 +108,17 @@ public class LikelihoodCalculationEngine {
|
|||
private final static String LIKELIHOODS_FILENAME = "likelihoods.txt";
|
||||
private final PrintStream likelihoodsStream;
|
||||
|
||||
/**
 * The PCR indel error models that can be selected for the likelihood calculation.
 */
public enum PCR_ERROR_MODEL {
    /** No specialized PCR error model is applied; base insertion/deletion qualities are used if they are present. */
    NONE,

    /** A more aggressive model that gives up some true positives in order to remove more false positives. */
    AGGRESSIVE,

    /** A less aggressive model that tries to keep a high true positive rate at the expense of allowing more false positives. */
    CONSERVATIVE
}
|
||||
|
||||
private final PCR_ERROR_MODEL pcrErrorModel;
|
||||
|
||||
/**
|
||||
* The expected rate of random sequencing errors for a read originating from its true haplotype.
|
||||
*
|
||||
|
|
@ -127,12 +142,15 @@ public class LikelihoodCalculationEngine {
|
|||
* assigned a likelihood of -13.
|
||||
* @param noFpga disable FPGA acceleration
|
||||
*/
|
||||
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga ) {
|
||||
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga, final PCR_ERROR_MODEL pcrErrorModel ) {
|
||||
this.hmmType = hmmType;
|
||||
this.constantGCP = constantGCP;
|
||||
this.DEBUG = debug;
|
||||
this.log10globalReadMismappingRate = log10globalReadMismappingRate;
|
||||
this.noFpga = noFpga;
|
||||
this.pcrErrorModel = pcrErrorModel;
|
||||
|
||||
initializePCRErrorModel();
|
||||
|
||||
if ( WRITE_LIKELIHOODS_TO_FILE ) {
|
||||
try {
|
||||
|
|
@ -145,20 +163,10 @@ public class LikelihoodCalculationEngine {
|
|||
}
|
||||
}
|
||||
|
||||
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate ) {
|
||||
this(constantGCP, debug, hmmType, log10globalReadMismappingRate, false);
|
||||
}
|
||||
|
||||
public LikelihoodCalculationEngine() {
|
||||
this((byte)10, false, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3, false);
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if ( likelihoodsStream != null ) likelihoodsStream.close();
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate
|
||||
*
|
||||
|
|
@ -196,11 +204,7 @@ public class LikelihoodCalculationEngine {
|
|||
// evaluate the likelihood of the reads given those haplotypes
|
||||
final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue());
|
||||
|
||||
final List<GATKSAMRecord> removedReads = map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE);
|
||||
// logger.info("Removed " + removedReads.size() + " reads because of bad likelihoods from sample " + sampleEntry.getKey());
|
||||
// for ( final GATKSAMRecord read : removedReads )
|
||||
// logger.info("\tRemoved " + read.getReadName());
|
||||
|
||||
map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE);
|
||||
stratifiedReadMap.put(sampleEntry.getKey(), map);
|
||||
}
|
||||
|
||||
|
|
@ -222,22 +226,27 @@ public class LikelihoodCalculationEngine {
|
|||
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
|
||||
final byte[] readBases = read.getReadBases();
|
||||
final byte[] overallGCP = new byte[read.getReadLength()];
|
||||
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
|
||||
|
||||
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
|
||||
final byte[] readQuals = read.getBaseQualities().clone();
|
||||
final byte[] readInsQuals = read.getBaseInsertionQualities();
|
||||
final byte[] readDelQuals = read.getBaseDeletionQualities();
|
||||
final byte[] readInsQuals = read.getBaseInsertionQualities().clone();
|
||||
final byte[] readDelQuals = read.getBaseDeletionQualities().clone();
|
||||
|
||||
applyPCRErrorModel(readBases, readInsQuals, readDelQuals);
|
||||
|
||||
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
|
||||
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
|
||||
//readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated
|
||||
//readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated
|
||||
// TODO -- why is Q18 hard-coded here???
|
||||
readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
|
||||
readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
|
||||
readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] );
|
||||
readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] );
|
||||
}
|
||||
|
||||
if ( batchPairHMM != null ) {
|
||||
batchPairHMM.batchAdd(haplotypes, read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP);
|
||||
batchPairHMM.batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP);
|
||||
batchedReads.add(read);
|
||||
continue;
|
||||
}
|
||||
|
|
@ -251,12 +260,12 @@ public class LikelihoodCalculationEngine {
|
|||
final Haplotype haplotype = haplotypes.get(jjj);
|
||||
final boolean isFirstHaplotype = jjj == 0;
|
||||
final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
|
||||
read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype);
|
||||
readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype);
|
||||
|
||||
if ( WRITE_LIKELIHOODS_TO_FILE ) {
|
||||
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
|
||||
haplotype.getBaseString(),
|
||||
new String(read.getReadBases()),
|
||||
new String(readBases),
|
||||
SAMUtils.phredToFastq(readQuals),
|
||||
SAMUtils.phredToFastq(readInsQuals),
|
||||
SAMUtils.phredToFastq(readDelQuals),
|
||||
|
|
@ -524,4 +533,48 @@ public class LikelihoodCalculationEngine {
|
|||
}
|
||||
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Experimental attempts at PCR error rate modeling
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
protected static final int MAX_STR_UNIT_LENGTH = 8;
|
||||
protected static final int MAX_REPEAT_LENGTH = 20;
|
||||
protected static final int MIN_ADJUSTED_QSCORE = 10;
|
||||
protected static final double INITIAL_QSCORE = 40.0;
|
||||
|
||||
private byte[] pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH * MAX_STR_UNIT_LENGTH + 1];
|
||||
private final RepeatCovariate repeatCovariate = new RepeatLengthCovariate();
|
||||
|
||||
private void initializePCRErrorModel() {
|
||||
if ( pcrErrorModel == PCR_ERROR_MODEL.NONE )
|
||||
return;
|
||||
|
||||
repeatCovariate.initialize(MAX_STR_UNIT_LENGTH, MAX_REPEAT_LENGTH);
|
||||
|
||||
pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH + 1];
|
||||
|
||||
final double rateFactor = pcrErrorModel == PCR_ERROR_MODEL.AGGRESSIVE ? 2.0 : 3.0;
|
||||
|
||||
for( int iii = 0; iii <= MAX_REPEAT_LENGTH; iii++ )
|
||||
pcrIndelErrorModelCache[iii] = getErrorModelAdjustedQual(iii, rateFactor);
|
||||
}
|
||||
|
||||
protected static byte getErrorModelAdjustedQual(final int repeatLength, final double rateFactor) {
|
||||
return (byte) Math.max(MIN_ADJUSTED_QSCORE, MathUtils.fastRound( INITIAL_QSCORE - Math.exp(((double) repeatLength) / (rateFactor * Math.PI)) + 1.0 ));
|
||||
}
|
||||
|
||||
protected void applyPCRErrorModel( final byte[] readBases, final byte[] readInsQuals, final byte[] readDelQuals ) {
|
||||
if ( pcrErrorModel == PCR_ERROR_MODEL.NONE )
|
||||
return;
|
||||
|
||||
for ( int iii = 1; iii < readBases.length; iii++ ) {
|
||||
final int repeatLength = repeatCovariate.findTandemRepeatUnits(readBases, iii-1).getSecond();
|
||||
readInsQuals[iii-1] = (byte) Math.min(0xff & readInsQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]);
|
||||
readDelQuals[iii-1] = (byte) Math.min(0xff & readDelQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -74,6 +74,11 @@ public abstract class RepeatCovariate implements ExperimentalCovariate {
|
|||
MAX_REPEAT_LENGTH = RAC.MAX_REPEAT_LENGTH;
|
||||
}
|
||||
|
||||
public void initialize(final int MAX_STR_UNIT_LENGTH, final int MAX_REPEAT_LENGTH) {
|
||||
this.MAX_STR_UNIT_LENGTH = MAX_STR_UNIT_LENGTH;
|
||||
this.MAX_REPEAT_LENGTH = MAX_REPEAT_LENGTH;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
|
||||
// store the original bases and then write Ns over low quality ones
|
||||
|
|
@ -103,7 +108,7 @@ public abstract class RepeatCovariate implements ExperimentalCovariate {
|
|||
|
||||
}
|
||||
|
||||
private Pair<byte[], Integer> findTandemRepeatUnits(byte[] readBases, int offset) {
|
||||
public Pair<byte[], Integer> findTandemRepeatUnits(byte[] readBases, int offset) {
|
||||
int maxBW = 0;
|
||||
byte[] bestBWRepeatUnit = new byte[]{readBases[offset]};
|
||||
for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) {
|
||||
|
|
|
|||
|
|
@ -57,7 +57,7 @@ import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCal
|
|||
public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest {
|
||||
|
||||
private void HCTestComplexVariants(String bam, String args, String md5) {
|
||||
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4";
|
||||
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4";
|
||||
final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec);
|
||||
}
|
||||
|
|
@ -68,7 +68,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
}
|
||||
|
||||
private void HCTestSymbolicVariants(String bam, String args, String md5) {
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1";
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1";
|
||||
final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec);
|
||||
}
|
||||
|
|
@ -80,7 +80,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
|
|||
}
|
||||
|
||||
private void HCTestComplexGGA(String bam, String args, String md5) {
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf";
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -79,7 +79,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
|
|||
*/
|
||||
@Test(dataProvider = "MyDataProvider")
|
||||
public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) {
|
||||
final String commandLine = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s %s -ERC %s --no_cmdline_in_header",
|
||||
final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header",
|
||||
b37KGReference, bam, intervals, mode);
|
||||
final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode;
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5));
|
||||
|
|
@ -88,7 +88,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testERCRegionWithNoCalledHaplotypes() {
|
||||
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF",
|
||||
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF",
|
||||
b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001");
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(""));
|
||||
spec.disableShadowBCF();
|
||||
|
|
|
|||
|
|
@ -76,7 +76,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
|
||||
|
||||
private void HCTest(String bam, String args, String md5) {
|
||||
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
|
||||
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testHaplotypeCaller: args=" + args, spec);
|
||||
}
|
||||
|
|
@ -108,7 +108,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
}
|
||||
|
||||
private void HCTestIndelQualityScores(String bam, String args, String md5) {
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2";
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec);
|
||||
}
|
||||
|
|
@ -123,7 +123,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference));
|
||||
final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary());
|
||||
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s";
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) {
|
||||
if( containsDuplicateRecord(vcf, parser) ) {
|
||||
|
|
@ -161,14 +161,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
// any of the calls in that region because it is so messy.
|
||||
@Test
|
||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("976463812534ac164a64c5d0c3ec988a"));
|
||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestStructuralIndels() {
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320"));
|
||||
executeTest("HCTestStructuralIndels: ", spec);
|
||||
}
|
||||
|
|
@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void HCTestDanglingTailMergingForDeletions() throws IOException {
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800";
|
||||
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList(""));
|
||||
final File outputVCF = executeTest("HCTestDanglingTailMergingForDeletions", spec).getFirst().get(0);
|
||||
|
||||
|
|
@ -210,7 +210,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void HCTestReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("277aa95b01fa4d4e0086a2fabf7f3d7e"));
|
||||
executeTest("HC calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
|
@ -218,7 +218,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testReducedBamWithReadsNotFullySpanningDeletion() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
|
||||
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
|
||||
Arrays.asList("6a9222905c740b9208bf3c67478514eb"));
|
||||
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
|
||||
}
|
||||
|
|
@ -232,7 +232,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void HCTestDBSNPAnnotationWGS() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
|
||||
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
|
||||
Arrays.asList("f3e636d64042e766cc6515987e85a968"));
|
||||
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
|
||||
}
|
||||
|
|
@ -240,9 +240,31 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void HCTestDBSNPAnnotationWEx() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
|
||||
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
|
||||
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
|
||||
Arrays.asList("1352cbe1404aefc94eb8e044539a9882"));
|
||||
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// test PCR indel model
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
public void HCTestAggressivePcrIndelModelWGS() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
|
||||
Arrays.asList("ab49f80783e5db5f9ab6b13ba2ad00cb"));
|
||||
executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestConservativePcrIndelModelWGS() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
|
||||
Arrays.asList("16f7ffa063511c70bad795639a1c2638"));
|
||||
executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -70,7 +70,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
|
|||
@Test(dataProvider = "NCTDataProvider")
|
||||
public void testHCNCT(final int nct, final String md5) {
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I "
|
||||
"-T HaplotypeCaller --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I "
|
||||
+ privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o %s " +
|
||||
" -L 20:10,000,000-10,100,000 -G none -A -contamination 0.0 -nct " + nct, 1,
|
||||
Arrays.asList(md5));
|
||||
|
|
|
|||
|
|
@ -54,9 +54,18 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
|||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate;
|
||||
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Unit tests for LikelihoodCalculationEngine
|
||||
*/
|
||||
|
|
@ -93,6 +102,45 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest {
|
|||
Assert.assertTrue(compareDoubleArrays(LikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix2), normalizedMatrix2));
|
||||
}
|
||||
|
||||
@DataProvider(name = "PcrErrorModelTestProvider")
|
||||
public Object[][] createPcrErrorModelTestData() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( final String repeat : Arrays.asList("A", "AC", "ACG", "ACGT") ) {
|
||||
for ( final int repeatLength : Arrays.asList(1, 2, 3, 5, 10, 15) ) {
|
||||
tests.add(new Object[]{repeat, repeatLength});
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "PcrErrorModelTestProvider", enabled = true)
|
||||
public void createPcrErrorModelTest(final String repeat, final int repeatLength) {
|
||||
|
||||
final LikelihoodCalculationEngine engine = new LikelihoodCalculationEngine((byte)0, false, PairHMM.HMM_IMPLEMENTATION.ORIGINAL, 0.0, true, LikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE);
|
||||
|
||||
final String readString = Utils.dupString(repeat, repeatLength);
|
||||
final byte[] insQuals = new byte[readString.length()];
|
||||
final byte[] delQuals = new byte[readString.length()];
|
||||
Arrays.fill(insQuals, (byte)LikelihoodCalculationEngine.INITIAL_QSCORE);
|
||||
Arrays.fill(delQuals, (byte)LikelihoodCalculationEngine.INITIAL_QSCORE);
|
||||
|
||||
engine.applyPCRErrorModel(readString.getBytes(), insQuals, delQuals);
|
||||
|
||||
final RepeatCovariate repeatCovariate = new RepeatLengthCovariate();
|
||||
repeatCovariate.initialize(LikelihoodCalculationEngine.MAX_STR_UNIT_LENGTH, LikelihoodCalculationEngine.MAX_REPEAT_LENGTH);
|
||||
|
||||
for ( int i = 1; i < insQuals.length; i++ ) {
|
||||
|
||||
final int repeatLengthFromCovariate = repeatCovariate.findTandemRepeatUnits(readString.getBytes(), i-1).getSecond();
|
||||
final byte adjustedScore = LikelihoodCalculationEngine.getErrorModelAdjustedQual(repeatLengthFromCovariate, 3.0);
|
||||
|
||||
Assert.assertEquals(insQuals[i-1], adjustedScore);
|
||||
Assert.assertEquals(delQuals[i-1], adjustedScore);
|
||||
}
|
||||
}
|
||||
|
||||
// BUGBUG: LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods has changed! Need to make new unit tests!
|
||||
/*
|
||||
private class BasicLikelihoodTestProvider extends TestDataProvider {
|
||||
|
|
|
|||
|
|
@ -124,7 +124,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
|
||||
/**
|
||||
* -GF NAME can be any binding in the FORMAT field (e.g., GQ, PL).
|
||||
* Note this argument accepts any number of inputs. So -F GQ -F PL is allowed.
|
||||
* Note this argument accepts any number of inputs. So -GF GQ -GF PL is allowed.
|
||||
*/
|
||||
@Argument(fullName="genotypeFields", shortName="GF", doc="The name of each genotype field to capture for output in the table", required=false)
|
||||
public List<String> genotypeFieldsToTake = new ArrayList<String>();
|
||||
|
|
@ -448,10 +448,6 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
|
||||
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
|
||||
getters.put("MULTI-ALLELIC", new Getter() { public String get(VariantContext vc) { return Boolean.toString(vc.getAlternateAlleles().size() > 1); } });
|
||||
getters.put("GQ", new Getter() { public String get(VariantContext vc) {
|
||||
if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF");
|
||||
return String.format("%.2f", -10 * vc.getGenotype(0).getLog10PError());
|
||||
}});
|
||||
}
|
||||
|
||||
private static Object splitAltAlleles(VariantContext vc) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue