Merge pull request #377 from broadinstitute/eb_experiment_with_pcr_error_model

Eb experiment with pcr error model
This commit is contained in:
Ryan Poplin 2013-08-16 08:38:55 -07:00
commit a21d5252c8
9 changed files with 180 additions and 48 deletions

View File

@ -472,6 +472,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
@Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false)
protected int PADDING_AROUND_SNPS_FOR_CALLING = 20;
/**
* Which PCR indel error model should we use when calculating likelihoods? If NONE is selected, then the default base
* insertion/deletion qualities will be used (or taken from the read if generated through the BaseRecalibrator).
*/
@Advanced
@Argument(fullName = "pcr_indel_model", shortName = "pcrModel", doc = "The PCR indel model to use", required = false)
public LikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = LikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE;
// -----------------------------------------------------------------------------------------------
// done with Haplotype caller parameters
// -----------------------------------------------------------------------------------------------
@ -624,7 +632,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
}
// create our likelihood calculation engine
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga );
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel );
final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes();

View File

@ -63,6 +63,8 @@ import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM;
import org.broadinstitute.sting.utils.pairhmm.CnyPairHMM;
import org.broadinstitute.sting.utils.pairhmm.BatchPairHMM;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.variant.variantcontext.Allele;
@ -76,6 +78,8 @@ import java.util.*;
public class LikelihoodCalculationEngine {
private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class);
private static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual
private final byte constantGCP;
private final double log10globalReadMismappingRate;
private final boolean DEBUG;
@ -104,6 +108,17 @@ public class LikelihoodCalculationEngine {
private final static String LIKELIHOODS_FILENAME = "likelihoods.txt";
private final PrintStream likelihoodsStream;
/**
 * Available PCR indel error models. Selected via the HaplotypeCaller --pcr_indel_model
 * argument and applied to base insertion/deletion qualities in applyPCRErrorModel().
 */
public enum PCR_ERROR_MODEL {
/** no specialized PCR error model will be applied; if base insertion/deletion qualities are present they will be used */
NONE,
/** a more aggressive model will be applied that sacrifices true positives in order to remove more false positives */
AGGRESSIVE,
/** a less aggressive model will be applied that tries to maintain a high true positive rate at the expense of allowing more false positives */
CONSERVATIVE
}
private final PCR_ERROR_MODEL pcrErrorModel;
/**
* The expected rate of random sequencing errors for a read originating from its true haplotype.
*
@ -127,12 +142,15 @@ public class LikelihoodCalculationEngine {
* assigned a likelihood of -13.
* @param noFpga disable FPGA acceleration
*/
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga ) {
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga, final PCR_ERROR_MODEL pcrErrorModel ) {
this.hmmType = hmmType;
this.constantGCP = constantGCP;
this.DEBUG = debug;
this.log10globalReadMismappingRate = log10globalReadMismappingRate;
this.noFpga = noFpga;
this.pcrErrorModel = pcrErrorModel;
initializePCRErrorModel();
if ( WRITE_LIKELIHOODS_TO_FILE ) {
try {
@ -145,20 +163,10 @@ public class LikelihoodCalculationEngine {
}
}
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate ) {
this(constantGCP, debug, hmmType, log10globalReadMismappingRate, false);
}
public LikelihoodCalculationEngine() {
this((byte)10, false, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3, false);
}
/**
 * Releases the per-read likelihoods debug output stream, if one was opened.
 * Safe to call when likelihood dumping is disabled (stream is null).
 */
public void close() {
    final PrintStream stream = likelihoodsStream;
    if ( stream != null ) {
        stream.close();
    }
}
/**
* Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate
*
@ -196,11 +204,7 @@ public class LikelihoodCalculationEngine {
// evaluate the likelihood of the reads given those haplotypes
final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue());
final List<GATKSAMRecord> removedReads = map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE);
// logger.info("Removed " + removedReads.size() + " reads because of bad likelihoods from sample " + sampleEntry.getKey());
// for ( final GATKSAMRecord read : removedReads )
// logger.info("\tRemoved " + read.getReadName());
map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE);
stratifiedReadMap.put(sampleEntry.getKey(), map);
}
@ -222,22 +226,27 @@ public class LikelihoodCalculationEngine {
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
for( final GATKSAMRecord read : reads ) {
final byte[] readBases = read.getReadBases();
final byte[] overallGCP = new byte[read.getReadLength()];
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
final byte[] readQuals = read.getBaseQualities().clone();
final byte[] readInsQuals = read.getBaseInsertionQualities();
final byte[] readDelQuals = read.getBaseDeletionQualities();
final byte[] readInsQuals = read.getBaseInsertionQualities().clone();
final byte[] readDelQuals = read.getBaseDeletionQualities().clone();
applyPCRErrorModel(readBases, readInsQuals, readDelQuals);
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
//readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated
//readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated
// TODO -- why is Q18 hard-coded here???
readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] );
readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] );
}
if ( batchPairHMM != null ) {
batchPairHMM.batchAdd(haplotypes, read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP);
batchPairHMM.batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP);
batchedReads.add(read);
continue;
}
@ -251,12 +260,12 @@ public class LikelihoodCalculationEngine {
final Haplotype haplotype = haplotypes.get(jjj);
final boolean isFirstHaplotype = jjj == 0;
final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype);
readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype);
if ( WRITE_LIKELIHOODS_TO_FILE ) {
likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
haplotype.getBaseString(),
new String(read.getReadBases()),
new String(readBases),
SAMUtils.phredToFastq(readQuals),
SAMUtils.phredToFastq(readInsQuals),
SAMUtils.phredToFastq(readDelQuals),
@ -524,4 +533,48 @@ public class LikelihoodCalculationEngine {
}
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
}
// --------------------------------------------------------------------------------
//
// Experimental attempts at PCR error rate modeling
//
// --------------------------------------------------------------------------------
protected static final int MAX_STR_UNIT_LENGTH = 8;
protected static final int MAX_REPEAT_LENGTH = 20;
protected static final int MIN_ADJUSTED_QSCORE = 10;
protected static final double INITIAL_QSCORE = 40.0;
private byte[] pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH * MAX_STR_UNIT_LENGTH + 1];
private final RepeatCovariate repeatCovariate = new RepeatLengthCovariate();
/**
 * Precomputes the PCR indel error quality cache, one adjusted quality per repeat
 * length 0..MAX_REPEAT_LENGTH. No-op when the model is NONE, in which case the
 * read's own base insertion/deletion qualities are used unmodified.
 */
private void initializePCRErrorModel() {
if ( pcrErrorModel == PCR_ERROR_MODEL.NONE )
return;
repeatCovariate.initialize(MAX_STR_UNIT_LENGTH, MAX_REPEAT_LENGTH);
// NOTE(review): replaces the field-initializer cache (sized MAX_REPEAT_LENGTH * MAX_STR_UNIT_LENGTH + 1)
// with a smaller one of MAX_REPEAT_LENGTH + 1 entries; assumes findTandemRepeatUnits
// never reports a repeat length greater than MAX_REPEAT_LENGTH -- confirm
pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH + 1];
// AGGRESSIVE uses a smaller rate factor, so quals decay faster with repeat length
final double rateFactor = pcrErrorModel == PCR_ERROR_MODEL.AGGRESSIVE ? 2.0 : 3.0;
for( int iii = 0; iii <= MAX_REPEAT_LENGTH; iii++ )
pcrIndelErrorModelCache[iii] = getErrorModelAdjustedQual(iii, rateFactor);
}
/**
 * Computes the PCR-model-adjusted indel quality for a given tandem repeat length.
 * The quality decays from INITIAL_QSCORE as exp(repeatLength / (rateFactor * pi))
 * grows, and is floored at MIN_ADJUSTED_QSCORE.
 *
 * @param repeatLength the observed tandem repeat length at a read position
 * @param rateFactor   model-specific decay factor (smaller = more aggressive decay)
 * @return the adjusted quality score, never below MIN_ADJUSTED_QSCORE
 */
protected static byte getErrorModelAdjustedQual(final int repeatLength, final double rateFactor) {
    final double decay = Math.exp(((double) repeatLength) / (rateFactor * Math.PI));
    return (byte) Math.max(MIN_ADJUSTED_QSCORE, MathUtils.fastRound(INITIAL_QSCORE + 1.0 - decay));
}
/**
 * Caps the given base insertion/deletion qualities using the precomputed PCR indel
 * error model: at each position a quality can only be lowered, to at most the cached
 * value for the tandem repeat length observed there. Mutates readInsQuals and
 * readDelQuals in place; no-op when the model is NONE.
 *
 * @param readBases    the read's bases (not modified)
 * @param readInsQuals base insertion qualities, modified in place
 * @param readDelQuals base deletion qualities, modified in place
 */
protected void applyPCRErrorModel( final byte[] readBases, final byte[] readInsQuals, final byte[] readDelQuals ) {
if ( pcrErrorModel == PCR_ERROR_MODEL.NONE )
return;
// NOTE(review): the loop runs iii = 1..length-1 but writes index iii-1, so the
// final base's qualities are never adjusted -- presumably intentional (no base
// after it to define a repeat context); confirm
for ( int iii = 1; iii < readBases.length; iii++ ) {
final int repeatLength = repeatCovariate.findTandemRepeatUnits(readBases, iii-1).getSecond();
// 0xff masking compares the qualities as unsigned bytes
readInsQuals[iii-1] = (byte) Math.min(0xff & readInsQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]);
readDelQuals[iii-1] = (byte) Math.min(0xff & readDelQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]);
}
}
}

View File

@ -74,6 +74,11 @@ public abstract class RepeatCovariate implements ExperimentalCovariate {
MAX_REPEAT_LENGTH = RAC.MAX_REPEAT_LENGTH;
}
/**
 * Programmatic initializer for use when this covariate is constructed outside the
 * recalibration argument framework (e.g. by the PCR indel error model in
 * LikelihoodCalculationEngine).
 *
 * @param MAX_STR_UNIT_LENGTH maximum repeat unit length (in bases) to consider
 * @param MAX_REPEAT_LENGTH   maximum repeat length to report
 */
public void initialize(final int MAX_STR_UNIT_LENGTH, final int MAX_REPEAT_LENGTH) {
this.MAX_STR_UNIT_LENGTH = MAX_STR_UNIT_LENGTH;
this.MAX_REPEAT_LENGTH = MAX_REPEAT_LENGTH;
}
@Override
public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
// store the original bases and then write Ns over low quality ones
@ -103,7 +108,7 @@ public abstract class RepeatCovariate implements ExperimentalCovariate {
}
private Pair<byte[], Integer> findTandemRepeatUnits(byte[] readBases, int offset) {
public Pair<byte[], Integer> findTandemRepeatUnits(byte[] readBases, int offset) {
int maxBW = 0;
byte[] bestBWRepeatUnit = new byte[]{readBases[offset]};
for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) {

View File

@ -57,7 +57,7 @@ import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCal
public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest {
private void HCTestComplexVariants(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4";
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4";
final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec);
}
@ -68,7 +68,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1";
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1";
final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec);
}
@ -80,7 +80,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
}
private void HCTestComplexGGA(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf";
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec);
}

View File

@ -79,7 +79,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
*/
@Test(dataProvider = "MyDataProvider")
public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) {
final String commandLine = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s %s -ERC %s --no_cmdline_in_header",
final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header",
b37KGReference, bam, intervals, mode);
final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode;
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5));
@ -88,7 +88,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
@Test
public void testERCRegionWithNoCalledHaplotypes() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF",
final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF",
b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001");
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(""));
spec.disableShadowBCF();

View File

@ -76,7 +76,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
private void HCTest(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCaller: args=" + args, spec);
}
@ -108,7 +108,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
}
private void HCTestIndelQualityScores(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2";
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec);
}
@ -123,7 +123,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference));
final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary());
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s";
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) {
if( containsDuplicateRecord(vcf, parser) ) {
@ -161,14 +161,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
// any of the calls in that region because it is so messy.
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("976463812534ac164a64c5d0c3ec988a"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@Test
public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320"));
executeTest("HCTestStructuralIndels: ", spec);
}
@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestDanglingTailMergingForDeletions() throws IOException {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800";
final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800";
final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList(""));
final File outputVCF = executeTest("HCTestDanglingTailMergingForDeletions", spec).getFirst().get(0);
@ -210,7 +210,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("277aa95b01fa4d4e0086a2fabf7f3d7e"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
@ -218,7 +218,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("6a9222905c740b9208bf3c67478514eb"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
}
@ -232,7 +232,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("f3e636d64042e766cc6515987e85a968"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
}
@ -240,9 +240,31 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestDBSNPAnnotationWEx() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
"-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("1352cbe1404aefc94eb8e044539a9882"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// test PCR indel model
//
// --------------------------------------------------------------------------------------------------------------
/** Regression test: HaplotypeCaller with the AGGRESSIVE PCR indel error model on a WGS interval. */
@Test
public void HCTestAggressivePcrIndelModelWGS() {
    final String commandLine = "-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000";
    final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(commandLine, 1, Arrays.asList("ab49f80783e5db5f9ab6b13ba2ad00cb"));
    executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec);
}
/** Regression test: HaplotypeCaller with the CONSERVATIVE PCR indel error model on a WGS interval. */
@Test
public void HCTestConservativePcrIndelModelWGS() {
    final String commandLine = "-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000";
    final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(commandLine, 1, Arrays.asList("16f7ffa063511c70bad795639a1c2638"));
    executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
}
}

View File

@ -70,7 +70,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
@Test(dataProvider = "NCTDataProvider")
public void testHCNCT(final int nct, final String md5) {
WalkerTestSpec spec = new WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I "
"-T HaplotypeCaller --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I "
+ privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o %s " +
" -L 20:10,000,000-10,100,000 -G none -A -contamination 0.0 -nct " + nct, 1,
Arrays.asList(md5));

View File

@ -54,9 +54,18 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* Unit tests for LikelihoodCalculationEngine
*/
@ -93,6 +102,45 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest {
Assert.assertTrue(compareDoubleArrays(LikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix2), normalizedMatrix2));
}
/**
 * Cartesian product of repeat units and repeat counts used to drive createPcrErrorModelTest.
 */
@DataProvider(name = "PcrErrorModelTestProvider")
public Object[][] createPcrErrorModelTestData() {
    final String[] repeatUnits = { "A", "AC", "ACG", "ACGT" };
    final int[] repeatCounts = { 1, 2, 3, 5, 10, 15 };
    final List<Object[]> cases = new ArrayList<Object[]>();
    for ( final String unit : repeatUnits ) {
        for ( final int count : repeatCounts ) {
            cases.add(new Object[]{ unit, count });
        }
    }
    return cases.toArray(new Object[][]{});
}
/**
 * Checks that applyPCRErrorModel caps the insertion/deletion qualities of a synthetic
 * repeat read to exactly the values predicted by getErrorModelAdjustedQual with the
 * CONSERVATIVE rate factor (3.0), position by position.
 */
@Test(dataProvider = "PcrErrorModelTestProvider", enabled = true)
public void createPcrErrorModelTest(final String repeat, final int repeatLength) {
// engine configured with the CONSERVATIVE model under test
final LikelihoodCalculationEngine engine = new LikelihoodCalculationEngine((byte)0, false, PairHMM.HMM_IMPLEMENTATION.ORIGINAL, 0.0, true, LikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE);
final String readString = Utils.dupString(repeat, repeatLength);
final byte[] insQuals = new byte[readString.length()];
final byte[] delQuals = new byte[readString.length()];
// start all quals at the model's initial score so any change comes from the model
Arrays.fill(insQuals, (byte)LikelihoodCalculationEngine.INITIAL_QSCORE);
Arrays.fill(delQuals, (byte)LikelihoodCalculationEngine.INITIAL_QSCORE);
engine.applyPCRErrorModel(readString.getBytes(), insQuals, delQuals);
// independent covariate to recompute the expected repeat length at each offset
final RepeatCovariate repeatCovariate = new RepeatLengthCovariate();
repeatCovariate.initialize(LikelihoodCalculationEngine.MAX_STR_UNIT_LENGTH, LikelihoodCalculationEngine.MAX_REPEAT_LENGTH);
// applyPCRErrorModel only touches indices 0..length-2, so iterate i = 1.. and check i-1
for ( int i = 1; i < insQuals.length; i++ ) {
final int repeatLengthFromCovariate = repeatCovariate.findTandemRepeatUnits(readString.getBytes(), i-1).getSecond();
final byte adjustedScore = LikelihoodCalculationEngine.getErrorModelAdjustedQual(repeatLengthFromCovariate, 3.0);
Assert.assertEquals(insQuals[i-1], adjustedScore);
Assert.assertEquals(delQuals[i-1], adjustedScore);
}
}
// BUGBUG: LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods has changed! Need to make new unit tests!
/*
private class BasicLikelihoodTestProvider extends TestDataProvider {

View File

@ -124,7 +124,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
/**
* -GF NAME can be any binding in the FORMAT field (e.g., GQ, PL).
* Note this argument accepts any number of inputs. So -F GQ -F PL is allowed.
* Note this argument accepts any number of inputs. So -GF GQ -GF PL is allowed.
*/
@Argument(fullName="genotypeFields", shortName="GF", doc="The name of each genotype field to capture for output in the table", required=false)
public List<String> genotypeFieldsToTake = new ArrayList<String>();
@ -448,10 +448,6 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
getters.put("MULTI-ALLELIC", new Getter() { public String get(VariantContext vc) { return Boolean.toString(vc.getAlternateAlleles().size() > 1); } });
getters.put("GQ", new Getter() { public String get(VariantContext vc) {
if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF");
return String.format("%.2f", -10 * vc.getGenotype(0).getLog10PError());
}});
}
private static Object splitAltAlleles(VariantContext vc) {