Merge pull request #193 from broadinstitute/eb_contamination_fixing_for_reduced_reads

Eb contamination fixing for reduced reads
This commit is contained in:
Mark DePristo 2013-04-26 09:48:45 -07:00
commit 071fd67d55
20 changed files with 325 additions and 168 deletions

View File

@ -54,7 +54,6 @@ import org.broadinstitute.sting.utils.collections.DefaultHashMap;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.File;
import java.io.PrintStream;
import java.util.Collections;
import java.util.Map;
@ -170,10 +169,6 @@ public class StandardCallerArgumentCollection {
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel();
@Hidden
@Argument(fullName = "logRemovedReadsFromContaminationFiltering", shortName="contaminationLog", required=false)
public PrintStream contaminationLog = null;
@Hidden
@Argument(shortName = "logExactCalls", doc="x", required=false)
public File exactCallsLog = null;
@ -192,7 +187,6 @@ public class StandardCallerArgumentCollection {
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION;
this.CONTAMINATION_FRACTION_FILE=SCAC.CONTAMINATION_FRACTION_FILE;
this.contaminationLog = SCAC.contaminationLog;
this.exactCallsLog = SCAC.exactCallsLog;
this.sampleContamination=SCAC.sampleContamination;
this.AFmodel = SCAC.AFmodel;

View File

@ -183,6 +183,6 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR
* @param headerLines
*/
public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set<VCFHeaderLine> headerLines ) {
useDithering = ! toolkit.getArguments().disableRandomization;
useDithering = ! toolkit.getArguments().disableDithering;
}
}

View File

@ -145,7 +145,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
final ReadBackedPileup pileup = context.getBasePileup();
if (pileup != null) {
final GenotypeBuilder b = new GenotypeBuilder(sample.getKey());
final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey()), UAC.contaminationLog);
final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.getSampleContamination().get(sample.getKey()));
b.PL(genotypeLikelihoods);
b.DP(getFilteredDepth(pileup));
genotypes.add(b.make());

View File

@ -105,7 +105,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
final Double contamination = UAC.getSampleContamination().get(sample.getKey());
if( contamination > 0.0 ) //no need to enter if no contamination reduction
pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup,contamination, UAC.contaminationLog);
pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup, contamination);
if ( useBAQedPileup )
pileup = createBAQedPileup(pileup);

View File

@ -64,7 +64,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.variantcontext.*;
import java.io.PrintStream;
import java.util.*;
public class GenotypingEngine {
@ -196,13 +195,13 @@ public class GenotypingEngine {
logger.info("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
}
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, UG_engine.getUAC().CONTAMINATION_FRACTION );
final GenotypesContext genotypes = calculateGLsForThisEvent( alleleReadMap, mergedVC );
final VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), mergedVC.isSNP() ? GenotypeLikelihoodsCalculationModel.Model.SNP : GenotypeLikelihoodsCalculationModel.Model.INDEL);
if( call != null ) {
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap_annotations = ( USE_FILTERED_READ_MAP_FOR_ANNOTATIONS ? alleleReadMap :
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0, UG_engine.getUAC().contaminationLog ) );
convertHaplotypeReadMapToAlleleReadMap( haplotypeReadMap, alleleMapper, 0.0 ) );
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = filterToOnlyOverlappingReads( genomeLocParser, alleleReadMap_annotations, perSampleFilteredReadList, call );
VariantContext annotatedCall = call;
@ -406,8 +405,7 @@ public class GenotypingEngine {
// BUGBUG: ugh, too complicated
protected Map<String, PerReadAlleleLikelihoodMap> convertHaplotypeReadMapToAlleleReadMap( final Map<String, PerReadAlleleLikelihoodMap> haplotypeReadMap,
final Map<Allele, List<Haplotype>> alleleMapper,
final double downsamplingFraction,
final PrintStream downsamplingLog ) {
final double downsamplingFraction ) {
final Map<String, PerReadAlleleLikelihoodMap> alleleReadMap = new LinkedHashMap<String, PerReadAlleleLikelihoodMap>();
for( final Map.Entry<String, PerReadAlleleLikelihoodMap> haplotypeReadMapEntry : haplotypeReadMap.entrySet() ) { // for each sample
@ -424,7 +422,7 @@ public class GenotypingEngine {
perReadAlleleLikelihoodMap.add(readEntry.getKey(), alleleMapperEntry.getKey(), maxLikelihood);
}
}
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); // perform contamination downsampling
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); // perform contamination downsampling
alleleReadMap.put(haplotypeReadMapEntry.getKey(), perReadAlleleLikelihoodMap);
}

View File

@ -61,7 +61,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.variant.variantcontext.Allele;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;
@ -213,13 +212,12 @@ public class PairHMMIndelErrorModel {
final ReferenceContext ref,
final int eventLength,
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap,
final double downsamplingFraction,
final PrintStream downsamplingLog) {
final double downsamplingFraction) {
final int numHaplotypes = haplotypeMap.size();
final int readCounts[] = new int[pileup.getNumberOfElements()];
final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts);
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog);
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction);
return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods);
}

View File

@ -56,8 +56,8 @@ import java.util.List;
public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132;
private final static String baseCommandIndels = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132;
// --------------------------------------------------------------------------------------------------------------
//
@ -73,7 +73,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("118ed5b54fc9ce1cde89f06a20afebef"));
Arrays.asList("d8b0c5be39ec6b239641c2f2646d2bc3"));
executeTest(String.format("test indel caller in SLX"), spec);
}
@ -88,7 +88,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -minIndelCnt 1" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("6ef59013331bc031ea37807b325d7d2c"));
Arrays.asList("d9572a227ccb13a6baa6dc4fb65bc1e5"));
executeTest(String.format("test indel caller in SLX with low min allele count"), spec);
}
@ -101,7 +101,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,500,000",
1,
Arrays.asList("dd3ee4675377191e34aaf67335e0219a"));
Arrays.asList("54e13f696f56eb742bf449ad11d0dc5f"));
executeTest(String.format("test indel calling, multiple technologies"), spec);
}
@ -111,7 +111,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("bb06ef8262f91664b7d2fe7e1e5df195"));
Arrays.asList("16d975480ff1e689113171805b916b62"));
executeTest("test MultiSample Pilot2 indels with alleles passed in", spec);
}
@ -121,7 +121,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles "
+ privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation +
"pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1,
Arrays.asList("0a2a8cc2d1a79e84624836a31de5491c"));
Arrays.asList("60ed3f8d5bc3f765e6ce3fa698b68bb7"));
executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec);
}
@ -136,7 +136,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation +
"low_coverage_CEU.chr1.10k-11k.bam -o %s -L " + result.get(0).getAbsolutePath(), 1,
Arrays.asList("939f80c6d2dfb592956aed3bdeaf319d"));
Arrays.asList("e87f5c76661527ef7aa44e528fe19573"));
executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2);
}
@ -176,7 +176,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction0() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.0", 1,
Arrays.asList("fc937f92e59dfe07b894411b5dfc166a"));
Arrays.asList("264325878b988acc11d8e5d9d2ba0b7f"));
executeTest("test minIndelFraction 0.0", spec);
}
@ -184,7 +184,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
public void testMinIndelFraction25() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
assessMinIndelFraction + " -minIndelFrac 0.25", 1,
Arrays.asList("41ad9e0edca4b9987390ba5c07f39e4a"));
Arrays.asList("98abcfb0a008050eba8b9c285a25b2a0"));
executeTest("test minIndelFraction 0.25", spec);
}
@ -200,7 +200,7 @@ public class UnifiedGenotyperIndelCallingIntegrationTest extends WalkerTest {
@Test
public void testHaplotype0Length() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -I " + privateTestDir + "haplotype0.bam -L 20:47507681 -R " + b37KGReference + " -baq CALCULATE_AS_NECESSARY -glm BOTH -o /dev/null",
"-T UnifiedGenotyper --disableDithering -I " + privateTestDir + "haplotype0.bam -L 20:47507681 -R " + b37KGReference + " -baq CALCULATE_AS_NECESSARY -glm BOTH -o /dev/null",
0,
Collections.<String>emptyList());
executeTest("testHaplotype0Length", spec);

View File

@ -60,8 +60,8 @@ import java.util.Collections;
public class UnifiedGenotyperIntegrationTest extends WalkerTest {
private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam";
private final static String baseCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam";
// --------------------------------------------------------------------------------------------------------------
//
@ -73,15 +73,15 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testMinBaseQualityScore() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1,
Arrays.asList("6ee6537e9ebc1bfc7c6cf8f04b1582ff"));
Arrays.asList("30be17df00acc8a92223f51fe7c1bdf7"));
executeTest("test min_base_quality_score 26", spec);
}
@Test
public void testSLOD() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("55760482335497086458b09e415ecf54"));
"-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("4aa226c00a242047cf427d0919003048"));
executeTest("test SLOD", spec);
}
@ -89,15 +89,15 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testNDA() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("938e888a40182878be4c3cc4859adb69"));
Arrays.asList("17f65eca1e6c1f06919a58f230b6d8d3"));
executeTest("test NDA", spec);
}
@Test
public void testCompTrack() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("7dc186d420487e4e156a24ec8dea0951"));
"-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1,
Arrays.asList("50937942e3d228614d2531c3be237709"));
executeTest("test using comp track", spec);
}
@ -111,17 +111,17 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testOutputParameterSitesOnly() {
testOutputParameters("-sites_only", "f99c7471127a6fb6f72e136bc873b2c9");
testOutputParameters("-sites_only", "48cd40d3994911a6f2609bfd375e1d2d");
}
@Test
public void testOutputParameterAllConfident() {
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "5649f72de04e1391e0f2bb86843d3d72");
testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "28f40ce47651f504158fc4e5bb58df4b");
}
@Test
public void testOutputParameterAllSites() {
testOutputParameters("--output_mode EMIT_ALL_SITES", "cb151bb9e90680b12714d481091ed209");
testOutputParameters("--output_mode EMIT_ALL_SITES", "5259dafaa1b57d9489003b16a48e35f8");
}
private void testOutputParameters(final String args, final String md5) {
@ -135,7 +135,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testConfidence() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1,
Arrays.asList("4af83a883ecc03a23b0aa6dd4b8f1ceb"));
Arrays.asList("918109938ef355d759dafc3ebb47d8a5"));
executeTest("test confidence 1", spec1);
}
@ -143,7 +143,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
public void testNoPrior() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 -noPrior", 1,
Arrays.asList("422656266117f8d01e17e5c491c49a24"));
Arrays.asList("7ac60bdc355d97c0939e644b58de47d7"));
executeTest("test no prior 1", spec1);
}
@ -155,12 +155,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
// --------------------------------------------------------------------------------------------------------------
@Test
public void testHeterozyosity1() {
testHeterozosity( 0.01, "ffc1f83a045dc09360e11de7a8efd159" );
testHeterozosity( 0.01, "3b66f82dbb746875638e076bf51a1583" );
}
@Test
public void testHeterozyosity2() {
testHeterozosity( 1.0 / 1850, "5426a98df9f5fd70aef295d889c4e4f1" );
testHeterozosity( 1.0 / 1850, "714c1795334c7c62c046a75479381ae6" );
}
private void testHeterozosity(final double arg, final String md5) {
@ -176,7 +176,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
//
// --------------------------------------------------------------------------------------------------------------
private final static String COMPRESSED_OUTPUT_MD5 = "d5a7326fdcf6d441b73c381912ad3a2a";
private final static String COMPRESSED_OUTPUT_MD5 = "6f79205f7ed8006470f056f6805db6c8";
@Test
public void testCompressedOutput() {
@ -232,7 +232,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -o %s" +
" -L 1:10,000,000-10,100,000",
1,
Arrays.asList("0a4a78da876bfa3d42170249a94357b4"));
Arrays.asList("d995b76adc3766889f7c2c88da14055c"));
executeTest(String.format("test multiple technologies"), spec);
}
@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
" -L 1:10,000,000-10,100,000" +
" -baq CALCULATE_AS_NECESSARY",
1,
Arrays.asList("89182fd4d9532ab4b2a0a84bfb557089"));
Arrays.asList("9669e1643d22d5ad047b3941aeefd6db"));
executeTest(String.format("test calling with BAQ"), spec);
}
@ -281,8 +281,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest {
@Test
public void testNsInCigar() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1,
Arrays.asList("4d36969d4f8f1094f1fb6e7e085c19f6"));
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1,
Arrays.asList("2ae3fd39c53a6954d32faed8703adfe8"));
executeTest("test calling on reads with Ns in CIGAR", spec);
}
}

View File

@ -53,7 +53,7 @@ import java.util.Arrays;
public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
private final static String baseCommand = "-T UnifiedGenotyper --disableDithering -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129;
// --------------------------------------------------------------------------------------------------------------
//
@ -64,7 +64,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testMultiSamplePilot1() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1,
Arrays.asList("52b6086f4597da5b35ab902bea4066fc"));
Arrays.asList("e3efd1917192ea743ac1e9958aa0a98f"));
executeTest("test MultiSample Pilot1", spec);
}
@ -72,7 +72,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testWithAllelesPassedIn1() {
WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec(
baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("5b31b811072a4df04524e13604015f9b"));
Arrays.asList("ebfcc3dd8c1788929cb50050c5d456df"));
executeTest("test MultiSample Pilot2 with alleles passed in", spec1);
}
@ -80,7 +80,7 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testWithAllelesPassedIn2() {
WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec(
baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1,
Arrays.asList("d9992e55381afb43742cc9b30fcd7538"));
Arrays.asList("698e54aeae3130779d246b9480a4052c"));
executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2);
}
@ -88,22 +88,22 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
public void testSingleSamplePilot2() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1,
Arrays.asList("33ab66c2f062cfa1f7fcc077165f778c"));
Arrays.asList("aaadb2a355d87344eabb6ac4495a11e4"));
executeTest("test SingleSample Pilot2", spec);
}
@Test
public void testMultipleSNPAlleles() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("28bfbff3da3af43d6a1eff673e5cb0f8"));
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1,
Arrays.asList("09a1a4d4bf0289bcc5e8a958f783a989"));
executeTest("test Multiple SNP alleles", spec);
}
@Test
public void testBadRead() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1,
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1,
Arrays.asList("d915535c1458733f09f82670092fcab6"));
executeTest("test bad read", spec);
}
@ -111,16 +111,16 @@ public class UnifiedGenotyperNormalCallingIntegrationTest extends WalkerTest{
@Test
public void testReverseTrim() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("a9edd04374ee9c42970291f39a50c191"));
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1,
Arrays.asList("57a1bb44967988f2b7ae7779127990ae"));
executeTest("test reverse trim", spec);
}
@Test
public void testMismatchedPLs() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("6fc32ca9de769060f3c2a3d94f8f2f91"));
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1,
Arrays.asList("3011c20165951ca43c8a4e86a5835dbd"));
executeTest("test mismatched PLs", spec);
}
}

View File

@ -62,25 +62,25 @@ public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest {
@Test
public void testReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("d55d37e2e86aefb91e47183d2c7dede8"));
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("da3fd775c8add1f7962baabf06b7d372"));
executeTest("test calling on a ReducedRead BAM", spec);
}
@Test
public void testReducedBamSNPs() {
testReducedCalling("SNP", "b424779c6609cb727a675bdd301290e6");
testReducedCalling("SNP", "76244ab1be60814f1412e6cd09e546cc");
}
@Test
public void testReducedBamINDELs() {
testReducedCalling("INDEL", "38c3d14cb9086f7355788d3db9b8ff16");
testReducedCalling("INDEL", "9a986b98ed014576ce923e07452447f4");
}
private void testReducedCalling(final String model, final String md5) {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1,
"-T UnifiedGenotyper --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1,
Arrays.asList(md5));
executeTest("test calling on a ReducedRead BAM with " + model, spec);
}

View File

@ -57,7 +57,7 @@ import static org.broadinstitute.sting.gatk.walkers.haplotypecaller.HaplotypeCal
public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends WalkerTest {
private void HCTestComplexVariants(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4";
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 4";
final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec);
}
@ -68,7 +68,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1";
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 1";
final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec);
}
@ -80,7 +80,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa
}
private void HCTestComplexGGA(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf";
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " --no_cmdline_in_header -o %s -minPruning 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerComplexGGA: args=" + args, spec);
}

View File

@ -73,7 +73,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final static String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
private void HCTest(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCaller: args=" + args, spec);
}
@ -105,7 +105,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
}
private void HCTestIndelQualityScores(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2";
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec);
}
@ -120,7 +120,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
final IndexedFastaSequenceFile fasta = new IndexedFastaSequenceFile(new File(b37KGReference));
final GenomeLocParser parser = new GenomeLocParser(fasta.getSequenceDictionary());
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s";
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, bam) + " -L 20:10,001,603-10,001,642 -L 20:10,001,653-10,001,742 --no_cmdline_in_header -o %s";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
for( final File vcf : executeTest("testHaplotypeCallerNearbySmallIntervals: args=" + args, spec).getFirst() ) {
if( containsDuplicateRecord(vcf, parser) ) {
@ -158,14 +158,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
// any of the calls in that region because it is so messy.
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("0689d2c202849fd05617648eaf429b9a"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@Test
public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("cb190c935541ebb9f660f713a882b922"));
executeTest("HCTestStructuralIndels: ", spec);
}
@ -173,7 +173,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestDoesNotFailOnBadRefBase() {
// don't care about the output - just want to make sure it doesn't fail
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2";
final String base = String.format("-T HaplotypeCaller --disableDithering -R %s -I %s", REF, privateTestDir + "NA12878.readsOverBadBase.chr3.bam") + " --no_cmdline_in_header -o /dev/null -L 3:60830000-60840000 --minPruning 3 -stand_call_conf 2 -stand_emit_conf 2";
final WalkerTestSpec spec = new WalkerTestSpec(base, Collections.<String>emptyList());
executeTest("HCTestDoesNotFailOnBadRefBase: ", spec);
}
@ -187,7 +187,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("3c87eb93ffe3a0166aca753050b981e1"));
executeTest("HC calling on a ReducedRead BAM", spec);
}
@ -195,7 +195,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test
public void testReducedBamWithReadsNotFullySpanningDeletion() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
"-T HaplotypeCaller --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1,
Arrays.asList("8adfa8a27a312760dab50787da595c57"));
executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec);
}

View File

@ -211,7 +211,7 @@ public class PerReadAlleleLikelihoodMapUnitTest extends BaseTest {
Assert.assertEquals(perReadAlleleLikelihoodMap.size(),pileup.depthOfCoverage()+10);
Assert.assertEquals(perReadAlleleLikelihoodMap.getAlleleStratifiedReadMap().get(base_A).size(),60);
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(0.1,null);
perReadAlleleLikelihoodMap.performPerAlleleDownsampling(0.1);
Assert.assertEquals(perReadAlleleLikelihoodMap.size(),(int) (0.9*(pileup.depthOfCoverage()+10)));
Map<Allele,List<GATKSAMRecord>> downsampledStrat = perReadAlleleLikelihoodMap.getAlleleStratifiedReadMap();

View File

@ -104,8 +104,9 @@ public class GATKArgumentCollection {
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
public boolean nonDeterministicRandomSeed = false;
@Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.")
public boolean disableRandomization = false;
@Hidden
@Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests. To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator.")
public boolean disableDithering = false;
@Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="If provided, that GATK will stop execution cleanly as soon after maxRuntime has been exceeded, truncating the run but not exiting with a failure. By default the value is interpreted in minutes, but this can be changed by maxRuntimeUnits", required = false)
public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT;

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMReadGroupRecord;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.DefaultHashMap;
import org.broadinstitute.sting.utils.exceptions.StingException;
@ -38,65 +37,62 @@ import org.broadinstitute.variant.variantcontext.Allele;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
import org.apache.log4j.Logger;
public class AlleleBiasedDownsamplingUtils {
// define this class so that we can use Java generics below
private final static class PileupElementList extends ArrayList<PileupElement> {}
/**
* Computes an allele biased version of the given pileup
*
* @param pileup the original pileup
* @param downsamplingFraction the fraction of total reads to remove per allele
* @param log logging output
* @return allele biased pileup
*/
public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) {
// special case removal of all or no reads
if ( downsamplingFraction <= 0.0 )
return pileup;
if ( downsamplingFraction >= 1.0 )
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>());
final ArrayList<PileupElement>[] alleleStratifiedElements = new ArrayList[4];
final PileupElementList[] alleleStratifiedElements = new PileupElementList[4];
for ( int i = 0; i < 4; i++ )
alleleStratifiedElements[i] = new ArrayList<PileupElement>();
alleleStratifiedElements[i] = new PileupElementList();
// start by stratifying the reads by the alleles they represent at this position
boolean sawReducedRead = false;
for ( final PileupElement pe : pileup ) {
// we do not want to remove a reduced read
if ( !pe.getRead().isReducedRead() ) {
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
if ( baseIndex != -1 )
alleleStratifiedElements[baseIndex].add(pe);
}
if ( pe.getRead().isReducedRead() )
sawReducedRead = true;
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
if ( baseIndex != -1 )
alleleStratifiedElements[baseIndex].add(pe);
}
// make a listing of allele counts
final int[] alleleCounts = new int[4];
for ( int i = 0; i < 4; i++ )
alleleCounts[i] = alleleStratifiedElements[i].size();
// make a listing of allele counts and calculate the total count
final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements, sawReducedRead);
final int totalAlleleCount = (int)MathUtils.sum(alleleCounts);
// do smart down-sampling
int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
final int numReadsToRemove = (int)(totalAlleleCount * downsamplingFraction); // floor
final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove);
final HashSet<PileupElement> readsToRemove = new HashSet<PileupElement>(numReadsToRemove);
for ( int i = 0; i < 4; i++ ) {
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
final PileupElementList alleleList = alleleStratifiedElements[i];
// if we don't need to remove any reads, then don't
if ( alleleList.size() > targetAlleleCounts[i] )
readsToRemove.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log));
if ( alleleCounts[i] > targetAlleleCounts[i] )
readsToRemove.addAll(downsampleElements(alleleList, alleleCounts[i], alleleCounts[i] - targetAlleleCounts[i]));
}
// clean up pointers so memory can be garbage collected if needed
for ( int i = 0; i < 4; i++ )
alleleStratifiedElements[i].clear();
// we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise
final List<PileupElement> readsToKeep = new ArrayList<PileupElement>(pileup.getNumberOfElements() - numReadsToRemove);
final List<PileupElement> readsToKeep = new ArrayList<PileupElement>(totalAlleleCount - numReadsToRemove);
for ( final PileupElement pe : pileup ) {
if ( !readsToRemove.contains(pe) ) {
readsToKeep.add(pe);
@ -106,6 +102,26 @@ public class AlleleBiasedDownsamplingUtils {
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(readsToKeep));
}
/**
* Calculates actual allele counts for each allele (which can be different than the list size when reduced reads are present)
*
* @param alleleStratifiedElements pileup elements stratified by allele
* @param sawReducedRead is at least one read a reduced read?
* @return non-null int array representing allele counts
*/
private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements, final boolean sawReducedRead) {
final int[] alleleCounts = new int[alleleStratifiedElements.length];
for ( int i = 0; i < alleleStratifiedElements.length; i++ ) {
if ( !sawReducedRead ) {
alleleCounts[i] = alleleStratifiedElements[i].size();
} else {
for ( final PileupElement pe : alleleStratifiedElements[i] )
alleleCounts[i] += pe.getRepresentativeCount();
}
}
return alleleCounts;
}
private static int scoreAlleleCounts(final int[] alleleCounts) {
if ( alleleCounts.length < 2 )
return 0;
@ -128,11 +144,11 @@ public class AlleleBiasedDownsamplingUtils {
}
/**
* Computes an allele biased version of the given pileup
* Computes an allele biased version of the allele counts for a given pileup
*
* @param alleleCounts the original pileup
* @param numReadsToRemove fraction of total reads to remove per allele
* @return allele biased pileup
* @param alleleCounts the allele counts for the original pileup
* @param numReadsToRemove number of total reads to remove per allele
* @return non-null array of new counts needed per allele
*/
protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) {
final int numAlleles = alleleCounts.length;
@ -169,36 +185,50 @@ public class AlleleBiasedDownsamplingUtils {
/**
* Performs allele biased down-sampling on a pileup and computes the list of elements to remove
*
* @param elements original list of records
* @param elements original list of pileup elements
* @param originalElementCount original count of elements (taking reduced reads into account)
* @param numElementsToRemove the number of records to remove
* @param log logging output
* @return the list of pileup elements TO REMOVE
*/
private static <T> List<T> downsampleElements(final List<T> elements, final int numElementsToRemove, final PrintStream log) {
ArrayList<T> elementsToRemove = new ArrayList<T>(numElementsToRemove);
protected static List<PileupElement> downsampleElements(final List<PileupElement> elements, final int originalElementCount, final int numElementsToRemove) {
// are there no elements to remove?
if ( numElementsToRemove == 0 )
return elementsToRemove;
return Collections.<PileupElement>emptyList();
final ArrayList<PileupElement> elementsToRemove = new ArrayList<PileupElement>(numElementsToRemove);
// should we remove all of the elements?
final int pileupSize = elements.size();
if ( numElementsToRemove == pileupSize ) {
logAllElements(elements, log);
if ( numElementsToRemove >= originalElementCount ) {
elementsToRemove.addAll(elements);
return elementsToRemove;
}
// create a bitset describing which elements to remove
final BitSet itemsToRemove = new BitSet(pileupSize);
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
final BitSet itemsToRemove = new BitSet(originalElementCount);
for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) {
itemsToRemove.set(selectedIndex);
}
for ( int i = 0; i < pileupSize; i++ ) {
if ( itemsToRemove.get(i) ) {
final T element = elements.get(i);
logElement(element, log);
int currentBitSetIndex = 0;
for ( final PileupElement element : elements ) {
final int representativeCount = element.getRepresentativeCount();
// if it's a reduced read, we need to be smart about how we down-sample
if ( representativeCount > 1 ) {
// count how many bits are set over the span represented by this read
int setBits = 0;
for ( int i = 0; i < representativeCount; i++ )
setBits += itemsToRemove.get(currentBitSetIndex++) ? 1 : 0;
// remove that count from the count of the reduced read
if ( setBits == representativeCount )
elementsToRemove.add(element);
else
element.adjustRepresentativeCount(-1 * setBits);
}
// otherwise it's trivial: remove if the corresponding bit is set
else if ( itemsToRemove.get(currentBitSetIndex++) ) {
elementsToRemove.add(element);
}
}
@ -211,10 +241,9 @@ public class AlleleBiasedDownsamplingUtils {
*
* @param alleleReadMap original list of records per allele
* @param downsamplingFraction the fraction of total reads to remove per allele
* @param log logging output
* @return list of reads TO REMOVE from allele biased down-sampling
*/
public static List<GATKSAMRecord> selectAlleleBiasedReads(final Map<Allele, List<GATKSAMRecord>> alleleReadMap, final double downsamplingFraction, final PrintStream log) {
public static List<GATKSAMRecord> selectAlleleBiasedReads(final Map<Allele, List<GATKSAMRecord>> alleleReadMap, final double downsamplingFraction) {
int totalReads = 0;
for ( final List<GATKSAMRecord> reads : alleleReadMap.values() )
totalReads += reads.size();
@ -225,6 +254,8 @@ public class AlleleBiasedDownsamplingUtils {
final List<Allele> alleles = new ArrayList<Allele>(alleleReadMap.keySet());
alleles.remove(Allele.NO_CALL); // ignore the no-call bin
final int numAlleles = alleles.size();
// TODO -- if we ever decide to make this work for reduced reads, this will need to use the representative counts instead
final int[] alleleCounts = new int[numAlleles];
for ( int i = 0; i < numAlleles; i++ )
alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size();
@ -234,38 +265,52 @@ public class AlleleBiasedDownsamplingUtils {
final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numReadsToRemove);
for ( int i = 0; i < numAlleles; i++ ) {
final List<GATKSAMRecord> alleleBin = alleleReadMap.get(alleles.get(i));
if ( alleleBin.size() > targetAlleleCounts[i] ) {
readsToRemove.addAll(downsampleElements(alleleBin, alleleBin.size() - targetAlleleCounts[i], log));
if ( alleleCounts[i] > targetAlleleCounts[i] ) {
readsToRemove.addAll(downsampleElements(alleleReadMap.get(alleles.get(i)), alleleCounts[i] - targetAlleleCounts[i]));
}
}
return readsToRemove;
}
private static <T> void logAllElements(final List<T> elements, final PrintStream log) {
if ( log != null ) {
for ( final T obj : elements ) {
logElement(obj, log);
}
/**
* Performs allele biased down-sampling on a pileup and computes the list of elements to remove
*
* @param reads original list of records
* @param numElementsToRemove the number of records to remove
* @return the list of pileup elements TO REMOVE
*/
protected static List<GATKSAMRecord> downsampleElements(final List<GATKSAMRecord> reads, final int numElementsToRemove) {
// are there no elements to remove?
if ( numElementsToRemove == 0 )
return Collections.<GATKSAMRecord>emptyList();
final ArrayList<GATKSAMRecord> elementsToRemove = new ArrayList<GATKSAMRecord>(numElementsToRemove);
final int originalElementCount = reads.size();
// should we remove all of the elements?
if ( numElementsToRemove >= originalElementCount ) {
elementsToRemove.addAll(reads);
return elementsToRemove;
}
}
private static <T> void logElement(final T obj, final PrintStream log) {
if ( log != null ) {
final GATKSAMRecord read;
if ( obj instanceof PileupElement )
read = ((PileupElement)obj).getRead();
else
read = (GATKSAMRecord)obj;
final SAMReadGroupRecord readGroup = read.getReadGroup();
log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit()));
// create a bitset describing which elements to remove
final BitSet itemsToRemove = new BitSet(originalElementCount);
for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) {
itemsToRemove.set(selectedIndex);
}
}
int currentBitSetIndex = 0;
for ( final GATKSAMRecord read : reads ) {
if ( read.isReducedRead() )
throw new IllegalStateException("Allele-biased downsampling of reduced reads has not been implemented for a list of GATKSAMRecords");
if ( itemsToRemove.get(currentBitSetIndex++) )
elementsToRemove.add(read);
}
return elementsToRemove;
}
/**
* Create sample-contamination maps from file

View File

@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.genotyper;
import com.google.java.contract.Ensures;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
@ -36,7 +35,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.variant.variantcontext.Allele;
import java.io.PrintStream;
import java.util.*;
/**
@ -44,9 +42,8 @@ import java.util.*;
* For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood.
*/
public class PerReadAlleleLikelihoodMap {
private final static Logger logger = Logger.getLogger(PerReadAlleleLikelihoodMap.class);
protected List<Allele> alleles;
protected Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;
protected final List<Allele> alleles;
protected final Map<GATKSAMRecord, Map<Allele, Double>> likelihoodReadMap;
public PerReadAlleleLikelihoodMap() {
likelihoodReadMap = new LinkedHashMap<GATKSAMRecord,Map<Allele,Double>>();
@ -78,17 +75,16 @@ public class PerReadAlleleLikelihoodMap {
}
public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction, log);
public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) {
return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction);
}
/**
* For each allele "a" , identify those reads whose most likely allele is "a", and remove a "downsamplingFraction" proportion
* of those reads from the "likelihoodReadMap". This is used for e.g. sample contamination
* @param downsamplingFraction - the fraction of supporting reads to remove from each allele. If <=0 all reads kept, if >=1 all reads tossed.
* @param log - a PrintStream to log the removed reads to (passed through to the utility function)
*/
public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) {
public void performPerAlleleDownsampling(final double downsamplingFraction) {
// special case removal of all or no reads
if ( downsamplingFraction <= 0.0 )
return;
@ -101,7 +97,7 @@ public class PerReadAlleleLikelihoodMap {
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = getAlleleStratifiedReadMap();
// compute the reads to remove and actually remove them
final List<GATKSAMRecord> readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction, log);
final List<GATKSAMRecord> readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction);
for ( final GATKSAMRecord read : readsToRemove )
likelihoodReadMap.remove(read);
}
@ -117,7 +113,8 @@ public class PerReadAlleleLikelihoodMap {
alleleReadMap.put(allele, new ArrayList<GATKSAMRecord>());
for ( final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : likelihoodReadMap.entrySet() ) {
// do not remove reduced reads!
// TODO -- come up with a strategy for down-sampling reduced reads
// Currently we are unable to remove reduced reads because their representative base count differs throughout the read
if ( !entry.getKey().isReducedRead() ) {
final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue());
if ( bestAllele.isInformative() )

View File

@ -303,7 +303,7 @@ public class PileupElement implements Comparable<PileupElement> {
* this being a reduced read and a deletion, we return the average number of elements between the left
* and right elements to the deletion. We assume the deletion to be left aligned.
*
* @return
* @return the representative count
*/
public int getRepresentativeCount() {
if (read.isReducedRead()) {
@ -318,6 +318,19 @@ public class PileupElement implements Comparable<PileupElement> {
}
}
/**
* Adjusts the representative count of this pileup element.
* Throws an exception if this element does not represent a reduced read.
*
* @param adjustmentFactor how much to adjust the representative count (can be positive or negative)
*/
public void adjustRepresentativeCount(final int adjustmentFactor) {
if ( read.isReducedRead() )
read.adjustReducedCount(offset, adjustmentFactor);
else
throw new IllegalArgumentException("Trying to adjust the representative count of a read that is not reduced");
}
/**
* Get the cigar element aligning this element to the genome
* @return a non-null CigarElement

View File

@ -400,7 +400,40 @@ public class GATKSAMRecord extends BAMRecord {
* @return the number of bases corresponding to the i'th base of the reduced read
*/
public final byte getReducedCount(final int i) {
return getReducedReadCounts()[i];
if ( !isReducedRead() )
throw new IllegalArgumentException("error trying to retrieve the reduced count from a read that is not reduced");
final byte[] reducedCounts = getReducedReadCounts();
return reducedCounts[i];
}
/**
* Sets the number of bases corresponding the i'th base of the reduced read.
*
* @param i the read based coordinate inside the read
* @param count the new count
*/
public final void setReducedCount(final int i, final byte count) {
if ( count < 0 )
throw new IllegalArgumentException("the reduced count cannot be set to a negative value");
if ( !isReducedRead() )
throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced");
final byte[] reducedCounts = getReducedReadCounts();
reducedCounts[i] = count;
setReducedReadCountsTag(reducedCounts);
}
/**
* Sets the number of bases corresponding the i'th base of the reduced read.
*
* @param i the read based coordinate inside the read
* @param adjustmentFactor how much to add/subtract to the current count
*/
public final void adjustReducedCount(final int i, final int adjustmentFactor) {
if ( !isReducedRead() )
throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced");
final byte[] reducedCounts = getReducedReadCounts();
final byte newCount = (byte)(reducedCounts[i] + adjustmentFactor);
setReducedCount(i, newCount);
}
/**

View File

@ -25,18 +25,21 @@
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.File;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.*;
/**
@ -115,6 +118,51 @@ public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest {
return true;
}
@DataProvider(name = "BiasedDownsamplingTest")
public Object[][] makeBiasedDownsamplingTest() {
final List<Object[]> tests = new LinkedList<Object[]>();
final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000);
for ( final int originalNormalCount : Arrays.asList(0, 1, 2, 10, 1000) ) {
for ( final int originalReducedCount : Arrays.asList(0, 1, 2, 10, 100) ) {
for ( final int indexToPutReducedRead : Arrays.asList(0, 2, originalNormalCount) ) {
if ( originalReducedCount == 0 || indexToPutReducedRead > originalNormalCount )
continue;
for ( final int toRemove : Arrays.asList(0, 1, 2, 10, 1000) ) {
if ( toRemove <= originalNormalCount + originalReducedCount )
tests.add(new Object[]{header, originalNormalCount, originalReducedCount, indexToPutReducedRead, toRemove});
}
}
}
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "BiasedDownsamplingTest")
public void testBiasedDownsampling(final SAMFileHeader header, final int originalNormalCount, final int originalReducedCount, final int indexToPutReducedRead, final int toRemove) {
final LinkedList<PileupElement> elements = new LinkedList<PileupElement>();
for ( int i = 0; i < originalNormalCount; i++ ) {
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1);
elements.add(new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0));
}
if ( originalReducedCount > 0 ) {
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1);
read.setReducedReadCountsTag(new byte[]{(byte)originalReducedCount});
elements.add(indexToPutReducedRead, new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0));
}
final List<PileupElement> result = AlleleBiasedDownsamplingUtils.downsampleElements(elements, originalNormalCount + originalReducedCount, toRemove);
int pileupCount = 0;
for ( final PileupElement pe : elements ) // reduced reads may have gotten modified
pileupCount += pe.getRepresentativeCount();
for ( final PileupElement pe : result )
pileupCount -= pe.getRepresentativeCount();
Assert.assertEquals(pileupCount, originalNormalCount + originalReducedCount - toRemove);
}
@Test
public void testLoadContaminationFileDetails(){

View File

@ -39,7 +39,7 @@ public class GATKSAMRecordUnitTest extends BaseTest {
final static String BASES = "ACTG";
final static String QUALS = "!+5?";
final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40, 1};
final private static byte[] REDUCED_READ_COUNTS_TAG = new byte[]{10, 10, 20, 30, -9}; // just the offsets
final private static byte[] REDUCED_READ_COUNTS_OFFSETS = new byte[]{10, 10, 20, 30, -9}; // just the offsets
@BeforeClass
public void init() {
@ -52,7 +52,7 @@ public class GATKSAMRecordUnitTest extends BaseTest {
reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length());
reducedRead.setReadBases(BASES.getBytes());
reducedRead.setBaseQualityString(QUALS);
reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG);
reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS_OFFSETS);
}
@Test
@ -66,6 +66,36 @@ public class GATKSAMRecordUnitTest extends BaseTest {
}
}
@Test(expectedExceptions = IllegalArgumentException.class)
public void testGetReducedCountOnNormalRead() {
read.getReducedCount(0);
}
@Test(expectedExceptions = IllegalArgumentException.class)
public void testSetReducedTagOnNormalRead() {
read.setReducedCount(0, (byte)2);
}
@Test(expectedExceptions = IllegalArgumentException.class)
public void testSetReducedCountToNegativeNumber() {
reducedRead.setReducedCount(0, (byte)1000);
}
@Test
public void testSetReducedTagOnReducedRead() {
for (int i = 0; i < reducedRead.getReadLength(); i++) {
final byte newCount = (byte)i;
reducedRead.setReducedCount(i, newCount);
Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i);
}
for (int i = 0; i < reducedRead.getReadLength(); i++) {
final int newCount = reducedRead.getReducedCount(i) + i;
reducedRead.adjustReducedCount(i, i);
Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i);
}
}
@Test
public void testReducedReadPileupElement() {
PileupElement readp = LocusIteratorByState.createPileupForReadAndOffset(read, 0);