Merge pull request #377 from broadinstitute/eb_experiment_with_pcr_error_model

Eb experiment with pcr error model
This commit is contained in:
Ryan Poplin 2013-08-16 08:38:55 -07:00
commit a21d5252c8
9 changed files with 180 additions and 48 deletions

View File

@ -472,6 +472,14 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
@Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false) @Argument(fullName="paddingAroundSNPs", shortName="paddingAroundSNPs", doc = "Include at least this many bases around an event for calling snps", required=false)
protected int PADDING_AROUND_SNPS_FOR_CALLING = 20; protected int PADDING_AROUND_SNPS_FOR_CALLING = 20;
/**
* Which PCR indel error model should we use when calculating likelihoods? If NONE is selected, then the default base
* insertion/deletion qualities will be used (or taken from the read if generated through the BaseRecalibrator).
*/
@Advanced
@Argument(fullName = "pcr_indel_model", shortName = "pcrModel", doc = "The PCR indel model to use", required = false)
public LikelihoodCalculationEngine.PCR_ERROR_MODEL pcrErrorModel = LikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE;
// ----------------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------------
// done with Haplotype caller parameters // done with Haplotype caller parameters
// ----------------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------------
@ -624,7 +632,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<List<VariantContext>, In
} }
// create our likelihood calculation engine // create our likelihood calculation engine
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM, log10GlobalReadMismappingRate, noFpga, pcrErrorModel );
final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes(); final MergeVariantsAcrossHaplotypes variantMerger = mergeVariantsViaLD ? new LDMerger(DEBUG, 10, 1) : new MergeVariantsAcrossHaplotypes();

View File

@ -63,6 +63,8 @@ import org.broadinstitute.sting.utils.pairhmm.LoglessPairHMM;
import org.broadinstitute.sting.utils.pairhmm.CnyPairHMM; import org.broadinstitute.sting.utils.pairhmm.CnyPairHMM;
import org.broadinstitute.sting.utils.pairhmm.BatchPairHMM; import org.broadinstitute.sting.utils.pairhmm.BatchPairHMM;
import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.Allele;
@ -76,6 +78,8 @@ import java.util.*;
public class LikelihoodCalculationEngine { public class LikelihoodCalculationEngine {
private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class); private final static Logger logger = Logger.getLogger(LikelihoodCalculationEngine.class);
private static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual
private final byte constantGCP; private final byte constantGCP;
private final double log10globalReadMismappingRate; private final double log10globalReadMismappingRate;
private final boolean DEBUG; private final boolean DEBUG;
@ -104,6 +108,17 @@ public class LikelihoodCalculationEngine {
private final static String LIKELIHOODS_FILENAME = "likelihoods.txt"; private final static String LIKELIHOODS_FILENAME = "likelihoods.txt";
private final PrintStream likelihoodsStream; private final PrintStream likelihoodsStream;
public enum PCR_ERROR_MODEL {
/** no specialized PCR error model will be applied; if base insertion/deletion qualities are present they will be used */
NONE,
/** a more aggressive model will be applied that sacrifices true positives in order to remove more false positives */
AGGRESSIVE,
/** a less aggressive model will be applied that tries to maintain a high true positive rate at the expense of allowing more false positives */
CONSERVATIVE
}
private final PCR_ERROR_MODEL pcrErrorModel;
/** /**
* The expected rate of random sequencing errors for a read originating from its true haplotype. * The expected rate of random sequencing errors for a read originating from its true haplotype.
* *
@ -127,12 +142,15 @@ public class LikelihoodCalculationEngine {
* assigned a likelihood of -13. * assigned a likelihood of -13.
* @param noFpga disable FPGA acceleration * @param noFpga disable FPGA acceleration
*/ */
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga ) { public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate, final boolean noFpga, final PCR_ERROR_MODEL pcrErrorModel ) {
this.hmmType = hmmType; this.hmmType = hmmType;
this.constantGCP = constantGCP; this.constantGCP = constantGCP;
this.DEBUG = debug; this.DEBUG = debug;
this.log10globalReadMismappingRate = log10globalReadMismappingRate; this.log10globalReadMismappingRate = log10globalReadMismappingRate;
this.noFpga = noFpga; this.noFpga = noFpga;
this.pcrErrorModel = pcrErrorModel;
initializePCRErrorModel();
if ( WRITE_LIKELIHOODS_TO_FILE ) { if ( WRITE_LIKELIHOODS_TO_FILE ) {
try { try {
@ -145,20 +163,10 @@ public class LikelihoodCalculationEngine {
} }
} }
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType, final double log10globalReadMismappingRate ) {
this(constantGCP, debug, hmmType, log10globalReadMismappingRate, false);
}
public LikelihoodCalculationEngine() {
this((byte)10, false, PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING, -3, false);
}
public void close() { public void close() {
if ( likelihoodsStream != null ) likelihoodsStream.close(); if ( likelihoodsStream != null ) likelihoodsStream.close();
} }
/** /**
* Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate * Initialize our pairHMM with parameters appropriate to the haplotypes and reads we're going to evaluate
* *
@ -196,11 +204,7 @@ public class LikelihoodCalculationEngine {
// evaluate the likelihood of the reads given those haplotypes // evaluate the likelihood of the reads given those haplotypes
final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue()); final PerReadAlleleLikelihoodMap map = computeReadLikelihoods(haplotypes, sampleEntry.getValue());
final List<GATKSAMRecord> removedReads = map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE);
// logger.info("Removed " + removedReads.size() + " reads because of bad likelihoods from sample " + sampleEntry.getKey());
// for ( final GATKSAMRecord read : removedReads )
// logger.info("\tRemoved " + read.getReadName());
stratifiedReadMap.put(sampleEntry.getKey(), map); stratifiedReadMap.put(sampleEntry.getKey(), map);
} }
@ -222,22 +226,27 @@ public class LikelihoodCalculationEngine {
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
for( final GATKSAMRecord read : reads ) { for( final GATKSAMRecord read : reads ) {
final byte[] readBases = read.getReadBases();
final byte[] overallGCP = new byte[read.getReadLength()]; final byte[] overallGCP = new byte[read.getReadLength()];
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data? Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
// NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read // NOTE -- must clone anything that gets modified here so we don't screw up future uses of the read
final byte[] readQuals = read.getBaseQualities().clone(); final byte[] readQuals = read.getBaseQualities().clone();
final byte[] readInsQuals = read.getBaseInsertionQualities(); final byte[] readInsQuals = read.getBaseInsertionQualities().clone();
final byte[] readDelQuals = read.getBaseDeletionQualities(); final byte[] readDelQuals = read.getBaseDeletionQualities().clone();
applyPCRErrorModel(readBases, readInsQuals, readDelQuals);
for( int kkk = 0; kkk < readQuals.length; kkk++ ) { for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality by mapping quality, as in UG
//readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
//readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] );
// TODO -- why is Q18 hard-coded here??? readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] );
readQuals[kkk] = ( readQuals[kkk] < (byte) 18 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
} }
if ( batchPairHMM != null ) { if ( batchPairHMM != null ) {
batchPairHMM.batchAdd(haplotypes, read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP); batchPairHMM.batchAdd(haplotypes, readBases, readQuals, readInsQuals, readDelQuals, overallGCP);
batchedReads.add(read); batchedReads.add(read);
continue; continue;
} }
@ -251,12 +260,12 @@ public class LikelihoodCalculationEngine {
final Haplotype haplotype = haplotypes.get(jjj); final Haplotype haplotype = haplotypes.get(jjj);
final boolean isFirstHaplotype = jjj == 0; final boolean isFirstHaplotype = jjj == 0;
final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), final double log10l = pairHMM.get().computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(),
read.getReadBases(), readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype); readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype);
if ( WRITE_LIKELIHOODS_TO_FILE ) { if ( WRITE_LIKELIHOODS_TO_FILE ) {
likelihoodsStream.printf("%s %s %s %s %s %s %f%n", likelihoodsStream.printf("%s %s %s %s %s %s %f%n",
haplotype.getBaseString(), haplotype.getBaseString(),
new String(read.getReadBases()), new String(readBases),
SAMUtils.phredToFastq(readQuals), SAMUtils.phredToFastq(readQuals),
SAMUtils.phredToFastq(readInsQuals), SAMUtils.phredToFastq(readInsQuals),
SAMUtils.phredToFastq(readDelQuals), SAMUtils.phredToFastq(readDelQuals),
@ -524,4 +533,48 @@ public class LikelihoodCalculationEngine {
} }
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" ); throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
} }
// --------------------------------------------------------------------------------
//
// Experimental attempts at PCR error rate modeling
//
// --------------------------------------------------------------------------------
protected static final int MAX_STR_UNIT_LENGTH = 8;
protected static final int MAX_REPEAT_LENGTH = 20;
protected static final int MIN_ADJUSTED_QSCORE = 10;
protected static final double INITIAL_QSCORE = 40.0;
private byte[] pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH * MAX_STR_UNIT_LENGTH + 1];
private final RepeatCovariate repeatCovariate = new RepeatLengthCovariate();
private void initializePCRErrorModel() {
if ( pcrErrorModel == PCR_ERROR_MODEL.NONE )
return;
repeatCovariate.initialize(MAX_STR_UNIT_LENGTH, MAX_REPEAT_LENGTH);
pcrIndelErrorModelCache = new byte[MAX_REPEAT_LENGTH + 1];
final double rateFactor = pcrErrorModel == PCR_ERROR_MODEL.AGGRESSIVE ? 2.0 : 3.0;
for( int iii = 0; iii <= MAX_REPEAT_LENGTH; iii++ )
pcrIndelErrorModelCache[iii] = getErrorModelAdjustedQual(iii, rateFactor);
}
protected static byte getErrorModelAdjustedQual(final int repeatLength, final double rateFactor) {
return (byte) Math.max(MIN_ADJUSTED_QSCORE, MathUtils.fastRound( INITIAL_QSCORE - Math.exp(((double) repeatLength) / (rateFactor * Math.PI)) + 1.0 ));
}
protected void applyPCRErrorModel( final byte[] readBases, final byte[] readInsQuals, final byte[] readDelQuals ) {
if ( pcrErrorModel == PCR_ERROR_MODEL.NONE )
return;
for ( int iii = 1; iii < readBases.length; iii++ ) {
final int repeatLength = repeatCovariate.findTandemRepeatUnits(readBases, iii-1).getSecond();
readInsQuals[iii-1] = (byte) Math.min(0xff & readInsQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]);
readDelQuals[iii-1] = (byte) Math.min(0xff & readDelQuals[iii-1], 0xff & pcrIndelErrorModelCache[repeatLength]);
}
}
} }

View File

@ -74,6 +74,11 @@ public abstract class RepeatCovariate implements ExperimentalCovariate {
MAX_REPEAT_LENGTH = RAC.MAX_REPEAT_LENGTH; MAX_REPEAT_LENGTH = RAC.MAX_REPEAT_LENGTH;
} }
public void initialize(final int MAX_STR_UNIT_LENGTH, final int MAX_REPEAT_LENGTH) {
this.MAX_STR_UNIT_LENGTH = MAX_STR_UNIT_LENGTH;
this.MAX_REPEAT_LENGTH = MAX_REPEAT_LENGTH;
}
@Override @Override
public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { public void recordValues(final GATKSAMRecord read, final ReadCovariates values) {
// store the original bases and then write Ns over low quality ones // store the original bases and then write Ns over low quality ones
@ -103,7 +108,7 @@ public abstract class RepeatCovariate implements ExperimentalCovariate {
} }
private Pair<byte[], Integer> findTandemRepeatUnits(byte[] readBases, int offset) { public Pair<byte[], Integer> findTandemRepeatUnits(byte[] readBases, int offset) {
int maxBW = 0; int maxBW = 0;
byte[] bestBWRepeatUnit = new byte[]{readBases[offset]}; byte[] bestBWRepeatUnit = new byte[]{readBases[offset]};
for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) { for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) {

View File

@ -57,7 +57,7 @@ import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCal
public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest { public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest {
private void HCTestComplexVariants(String bam, String args, String md5) { private void HCTestComplexVariants(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4"; final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4";
final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec); executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec);
} }
@ -68,7 +68,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
} }
private void HCTestSymbolicVariants(String bam, String args, String md5) { private void HCTestSymbolicVariants(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1"; final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1";
final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5)); final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec); executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec);
} }
@ -80,7 +80,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
} }
private void HCTestComplexGGA(String bam, String args, String md5) { private void HCTestComplexGGA(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf"; final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec); executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec);
} }

View File

@ -79,7 +79,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
*/ */
@Test(dataProvider = "MyDataProvider") @Test(dataProvider = "MyDataProvider")
public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) { public void testHCWithGVCF(String bam, HaplotypeCaller.ReferenceConfidenceMode mode, String intervals, String md5) {
final String commandLine = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s %s -ERC %s --no_cmdline_in_header", final String commandLine = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s %s -ERC %s --no_cmdline_in_header",
b37KGReference, bam, intervals, mode); b37KGReference, bam, intervals, mode);
final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode; final String name = "testHCWithGVCF bam=" + bam + " intervals= " + intervals + " gvcf= " + mode;
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5)); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(md5));
@ -88,7 +88,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest {
@Test @Test
public void testERCRegionWithNoCalledHaplotypes() { public void testERCRegionWithNoCalledHaplotypes() {
final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s -ERC GVCF", final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -R %s -I %s -L %s -ERC GVCF",
b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001"); b37KGReference, privateTestDir + "noCallRefModel.bam", "20:17000001-18000001");
final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList(""));
spec.disableShadowBCF(); spec.disableShadowBCF();

View File

@ -76,7 +76,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
private void HCTest(String bam, String args, String md5) { private void HCTest(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; final String base = String.format("-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCaller: args=" + args, spec); executeTest("testHaplotypeCaller: args=" + args, spec);
} }
@ -108,7 +108,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
} }
private void HCTestIndelQualityScores(String bam, String args, String md5) { private void HCTestIndelQualityScores(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec); executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec);
} }
@ -123,7 +123,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference)); final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference));
final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary()); final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary());
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s"; final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) { for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) {
if( containsDuplicateRecord(vcf, parser) ) { if( containsDuplicateRecord(vcf, parser) ) {
@ -161,14 +161,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
// any of the calls in that region because it is so messy. // any of the calls in that region because it is so messy.
@Test @Test
public void HCTestProblematicReadsModifiedInActiveRegions() { public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("976463812534ac164a64c5d0c3ec988a")); final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("976463812534ac164a64c5d0c3ec988a"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
} }
@Test @Test
public void HCTestStructuralIndels() { public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320")); final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("91717e5e271742c2c9b67223e58f1320"));
executeTest("HCTestStructuralIndels: ", spec); executeTest("HCTestStructuralIndels: ", spec);
} }
@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void HCTestDanglingTailMergingForDeletions() throws IOException { public void HCTestDanglingTailMergingForDeletions() throws IOException {
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800"; final String base = String.format("-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R %s -I %s", REF, NA12878_BAM) + " --no_cmdline_in_header -o %s -L 20:10130740-10130800";
final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList("")); final WalkerTestSpec spec = new WalkerTestSpec(base, 1, Arrays.asList(""));
final File outputVCF = executeTest("HCTestDanglingTailMergingForDeletions", spec).getFirst().get(0); final File outputVCF = executeTest("HCTestDanglingTailMergingForDeletions", spec).getFirst().get(0);
@ -210,7 +210,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void HCTestReducedBam() { public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("277aa95b01fa4d4e0086a2fabf7f3d7e")); Arrays.asList("277aa95b01fa4d4e0086a2fabf7f3d7e"));
executeTest("HC calling on a ReducedRead BAM", spec); executeTest("HC calling on a ReducedRead BAM", spec);
} }
@ -218,7 +218,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testReducedBamWithReadsNotFullySpanningDeletion() { public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("6a9222905c740b9208bf3c67478514eb")); Arrays.asList("6a9222905c740b9208bf3c67478514eb"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
} }
@ -232,7 +232,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void HCTestDBSNPAnnotationWGS() { public void HCTestDBSNPAnnotationWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1, "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-10,100,000 -D " + b37dbSNP132, 1,
Arrays.asList("f3e636d64042e766cc6515987e85a968")); Arrays.asList("f3e636d64042e766cc6515987e85a968"));
executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec); executeTest("HC calling with dbSNP ID annotation on WGS intervals", spec);
} }
@ -240,9 +240,31 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void HCTestDBSNPAnnotationWEx() { public void HCTestDBSNPAnnotationWEx() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132 "-T HaplotypeCaller --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_PCRFREE + " -o %s -L 20:10,000,000-11,000,000 -D " + b37dbSNP132
+ " -L " + hg19Intervals + " -isr INTERSECTION", 1, + " -L " + hg19Intervals + " -isr INTERSECTION", 1,
Arrays.asList("1352cbe1404aefc94eb8e044539a9882")); Arrays.asList("1352cbe1404aefc94eb8e044539a9882"));
executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec); executeTest("HC calling with dbSNP ID annotation on WEx intervals", spec);
} }
// --------------------------------------------------------------------------------------------------------------
//
// test PCR indel model
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void HCTestAggressivePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model AGGRESSIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
Arrays.asList("ab49f80783e5db5f9ab6b13ba2ad00cb"));
executeTest("HC calling with aggressive indel error modeling on WGS intervals", spec);
}
@Test
public void HCTestConservativePcrIndelModelWGS() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller --disableDithering --pcr_indel_model CONSERVATIVE -R " + b37KGReference + " --no_cmdline_in_header -I " + NA12878_BAM + " -o %s -L 20:10,000,000-10,300,000", 1,
Arrays.asList("16f7ffa063511c70bad795639a1c2638"));
executeTest("HC calling with conservative indel error modeling on WGS intervals", spec);
}
} }

View File

@ -70,7 +70,7 @@ public class HaplotypeCallerParallelIntegrationTest extends WalkerTest {
@Test(dataProvider = "NCTDataProvider") @Test(dataProvider = "NCTDataProvider")
public void testHCNCT(final int nct, final String md5) { public void testHCNCT(final int nct, final String md5) {
WalkerTestSpec spec = new WalkerTestSpec( WalkerTestSpec spec = new WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " "-T HaplotypeCaller --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I "
+ privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o %s " + + privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam -o %s " +
" -L 20:10,000,000-10,100,000 -G none -A -contamination 0.0 -nct " + nct, 1, " -L 20:10,000,000-10,100,000 -G none -A -contamination 0.0 -nct " + nct, 1,
Arrays.asList(md5)); Arrays.asList(md5));

View File

@ -54,9 +54,18 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate;
import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/** /**
* Unit tests for LikelihoodCalculationEngine * Unit tests for LikelihoodCalculationEngine
*/ */
@ -93,6 +102,45 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest {
Assert.assertTrue(compareDoubleArrays(LikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix2), normalizedMatrix2)); Assert.assertTrue(compareDoubleArrays(LikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix2), normalizedMatrix2));
} }
@DataProvider(name = "PcrErrorModelTestProvider")
public Object[][] createPcrErrorModelTestData() {
List<Object[]> tests = new ArrayList<Object[]>();
for ( final String repeat : Arrays.asList("A", "AC", "ACG", "ACGT") ) {
for ( final int repeatLength : Arrays.asList(1, 2, 3, 5, 10, 15) ) {
tests.add(new Object[]{repeat, repeatLength});
}
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "PcrErrorModelTestProvider", enabled = true)
public void createPcrErrorModelTest(final String repeat, final int repeatLength) {
final LikelihoodCalculationEngine engine = new LikelihoodCalculationEngine((byte)0, false, PairHMM.HMM_IMPLEMENTATION.ORIGINAL, 0.0, true, LikelihoodCalculationEngine.PCR_ERROR_MODEL.CONSERVATIVE);
final String readString = Utils.dupString(repeat, repeatLength);
final byte[] insQuals = new byte[readString.length()];
final byte[] delQuals = new byte[readString.length()];
Arrays.fill(insQuals, (byte)LikelihoodCalculationEngine.INITIAL_QSCORE);
Arrays.fill(delQuals, (byte)LikelihoodCalculationEngine.INITIAL_QSCORE);
engine.applyPCRErrorModel(readString.getBytes(), insQuals, delQuals);
final RepeatCovariate repeatCovariate = new RepeatLengthCovariate();
repeatCovariate.initialize(LikelihoodCalculationEngine.MAX_STR_UNIT_LENGTH, LikelihoodCalculationEngine.MAX_REPEAT_LENGTH);
for ( int i = 1; i < insQuals.length; i++ ) {
final int repeatLengthFromCovariate = repeatCovariate.findTandemRepeatUnits(readString.getBytes(), i-1).getSecond();
final byte adjustedScore = LikelihoodCalculationEngine.getErrorModelAdjustedQual(repeatLengthFromCovariate, 3.0);
Assert.assertEquals(insQuals[i-1], adjustedScore);
Assert.assertEquals(delQuals[i-1], adjustedScore);
}
}
// BUGBUG: LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods has changed! Need to make new unit tests! // BUGBUG: LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods has changed! Need to make new unit tests!
/* /*
private class BasicLikelihoodTestProvider extends TestDataProvider { private class BasicLikelihoodTestProvider extends TestDataProvider {

View File

@ -124,7 +124,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
/** /**
* -GF NAME can be any binding in the FORMAT field (e.g., GQ, PL). * -GF NAME can be any binding in the FORMAT field (e.g., GQ, PL).
* Note this argument accepts any number of inputs. So -F GQ -F PL is allowed. * Note this argument accepts any number of inputs. So -GF GQ -GF PL is allowed.
*/ */
@Argument(fullName="genotypeFields", shortName="GF", doc="The name of each genotype field to capture for output in the table", required=false) @Argument(fullName="genotypeFields", shortName="GF", doc="The name of each genotype field to capture for output in the table", required=false)
public List<String> genotypeFieldsToTake = new ArrayList<String>(); public List<String> genotypeFieldsToTake = new ArrayList<String>();
@ -448,10 +448,6 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } }); getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } }); getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
getters.put("MULTI-ALLELIC", new Getter() { public String get(VariantContext vc) { return Boolean.toString(vc.getAlternateAlleles().size() > 1); } }); getters.put("MULTI-ALLELIC", new Getter() { public String get(VariantContext vc) { return Boolean.toString(vc.getAlternateAlleles().size() > 1); } });
getters.put("GQ", new Getter() { public String get(VariantContext vc) {
if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF");
return String.format("%.2f", -10 * vc.getGenotype(0).getLog10PError());
}});
} }
private static Object splitAltAlleles(VariantContext vc) { private static Object splitAltAlleles(VariantContext vc) {