From bb312814a2c074a0e2dcbc3e5261c54e390bd03f Mon Sep 17 00:00:00 2001 From: ebanks Date: Wed, 16 Dec 2009 17:28:09 +0000 Subject: [PATCH] UG is now officially in the business of making good SNP calls (as opposed to being hyper-aggressive in its calls and expecting the end-user to filter). Bad/suspicious bases/reads (high mismatch rate, low MQ, low BQ, bad mates) are now filtered out by default (and not used for the annotations either), although this can all be turned off. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2373 348d0f76-0448-11de-a6fe-93d51630548a --- .../contexts/StratifiedAlignmentContext.java | 30 ++++++------- .../gatk/walkers/annotator/AlleleBalance.java | 2 +- .../walkers/annotator/DepthOfCoverage.java | 2 +- .../gatk/walkers/annotator/MismatchRate.java | 35 --------------- .../gatk/walkers/annotator/RankSumTest.java | 2 +- .../walkers/annotator/SecondBaseSkew.java | 2 +- .../walkers/annotator/SpanningDeletions.java | 2 +- .../walkers/annotator/VariantAnnotator.java | 2 +- .../DiploidGenotypeCalculationModel.java | 2 +- .../genotyper/EMGenotypeCalculationModel.java | 6 +-- ...JointEstimateGenotypeCalculationModel.java | 6 +-- ...PointEstimateGenotypeCalculationModel.java | 2 +- .../genotyper/UnifiedArgumentCollection.java | 14 ++++-- .../walkers/genotyper/UnifiedGenotyper.java | 43 +++++++++++++------ .../sting/utils/AlignmentUtils.java | 24 ++++------- .../utils/genotype/vcf/VCFGenotypeRecord.java | 2 +- .../SecondBaseSkewIntegrationTest.java | 2 +- .../VariantAnnotatorIntegrationTest.java | 8 ++-- .../UnifiedGenotyperIntegrationTest.java | 38 ++++++++-------- 19 files changed, 99 insertions(+), 125 deletions(-) delete mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/annotator/MismatchRate.java diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java index cc51a3dbf..1f060ba7d 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java @@ -44,11 +44,10 @@ public class StratifiedAlignmentContext { // Definitions: // COMPLETE = full alignment context - // MQ0FREE = full context without MQ0 reads - // FORWARD = reads on forward strand (*no* MQ0 reads) - // REVERSE = reads on forward strand (*no* MQ0 reads) + // FORWARD = reads on forward strand + // REVERSE = reads on forward strand // - public enum StratifiedContextType { COMPLETE, MQ0FREE, FORWARD, REVERSE } + public enum StratifiedContextType { COMPLETE, FORWARD, REVERSE } private GenomeLoc loc; private AlignmentContext[] contexts = new AlignmentContext[StratifiedContextType.values().length]; @@ -72,16 +71,12 @@ public class StratifiedAlignmentContext { } public void add(SAMRecord read, int offset) { - if ( read.getMappingQuality() > 0 ) { - reads[StratifiedContextType.MQ0FREE.ordinal()].add(read); - offsets[StratifiedContextType.MQ0FREE.ordinal()].add(offset); - if ( read.getReadNegativeStrandFlag() ) { - reads[StratifiedContextType.REVERSE.ordinal()].add(read); - offsets[StratifiedContextType.REVERSE.ordinal()].add(offset); - } else { - reads[StratifiedContextType.FORWARD.ordinal()].add(read); - offsets[StratifiedContextType.FORWARD.ordinal()].add(offset); - } + if ( read.getReadNegativeStrandFlag() ) { + reads[StratifiedContextType.REVERSE.ordinal()].add(read); + offsets[StratifiedContextType.REVERSE.ordinal()].add(offset); + } else { + reads[StratifiedContextType.FORWARD.ordinal()].add(read); + offsets[StratifiedContextType.FORWARD.ordinal()].add(offset); } reads[StratifiedContextType.COMPLETE.ordinal()].add(read); offsets[StratifiedContextType.COMPLETE.ordinal()].add(offset); @@ -90,18 +85,17 @@ public class StratifiedAlignmentContext { /** * Splits the given AlignmentContext into a StratifiedAlignmentContext per sample. * - * @param context the original AlignmentContext + * @param pileup the original pileup * @param assumedSingleSample if not null, any read without a readgroup will be given this sample name * @param collapseToThisSample if not null, all reads will be assigned this read group regardless of their actual read group * * @return a Map of sample name to StratifiedAlignmentContext * **/ - public static Map splitContextBySample(AlignmentContext context, String assumedSingleSample, String collapseToThisSample) { + public static Map splitContextBySample(ReadBackedPileup pileup, String assumedSingleSample, String collapseToThisSample) { HashMap contexts = new HashMap(); - ReadBackedPileup pileup = context.getPileup(); for (PileupElement p : pileup ) { // get the read @@ -125,7 +119,7 @@ public class StratifiedAlignmentContext { // create a new context object if this is the first time we're seeing a read for this sample StratifiedAlignmentContext myContext = contexts.get(sample); if ( myContext == null ) { - myContext = new StratifiedAlignmentContext(context.getLocation()); + myContext = new StratifiedAlignmentContext(pileup.getLocation()); contexts.put(sample, myContext); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java index b9c4feba0..25e38e078 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java @@ -39,7 +39,7 @@ public class AlleleBalance extends StandardVariantAnnotation { if ( genotypeStr.length() != 2 ) return null; - final String bases = new String(context.getContext(StratifiedAlignmentContext.StratifiedContextType.MQ0FREE).getPileup().getBases()).toUpperCase(); + final String bases = new String(context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getPileup().getBases()).toUpperCase(); if ( bases.length() == 0 ) return null; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index 35c696b2b..7868c01ef 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -20,5 +20,5 @@ public class DepthOfCoverage extends StandardVariantAnnotation { public String getKeyName() { return VCFRecord.DEPTH_KEY; } - public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine(getKeyName(), 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Total Depth (including MQ0 reads)"); } + public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine(getKeyName(), 1, VCFInfoHeaderLine.INFO_TYPE.Integer, "Total Depth"); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MismatchRate.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MismatchRate.java deleted file mode 100755 index f6e99e3a8..000000000 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MismatchRate.java +++ /dev/null @@ -1,35 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.contexts.StratifiedAlignmentContext; -import org.broadinstitute.sting.utils.pileup.*; -import org.broadinstitute.sting.utils.genotype.Variation; -import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.*; - -import java.util.Map; - - -public class MismatchRate implements VariantAnnotation { - - public String annotate(ReferenceContext ref, Map stratifiedContexts, Variation variation) { - int mismatches = 0; - int totalBases = 0; - for ( String sample : stratifiedContexts.keySet() ) { - ReadBackedPileup pileup = stratifiedContexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.MQ0FREE).getPileup(); - Pair counts = AlignmentUtils.mismatchesInRefWindow(pileup, ref, true); - mismatches += counts.first; - totalBases += counts.second; - } - - // sanity check - if ( totalBases == 0 ) - return null; - - return String.format("%.2f", (double)mismatches/(double)totalBases); - } - - public String getKeyName() { return "MR"; } - - public VCFInfoHeaderLine getDescription() { return new VCFInfoHeaderLine("MR", 1, VCFInfoHeaderLine.INFO_TYPE.Float, "Mismatch Rate of Reads Spanning This Position"); } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index d9894e7af..b990573df 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -36,7 +36,7 @@ public class RankSumTest implements VariantAnnotation { if ( context == null ) continue; - fillQualsFromPileup(ref.getBase(), variation.getAlternativeBaseForSNP(), context.getContext(StratifiedAlignmentContext.StratifiedContextType.MQ0FREE).getPileup(), refQuals, altQuals); + fillQualsFromPileup(ref.getBase(), variation.getAlternativeBaseForSNP(), context.getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getPileup(), refQuals, altQuals); } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SecondBaseSkew.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SecondBaseSkew.java index 4cf84fce9..b81c0d6f3 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SecondBaseSkew.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SecondBaseSkew.java @@ -37,7 +37,7 @@ public class SecondBaseSkew implements VariantAnnotation { Pair depth = new Pair(0, 0); for ( String sample : stratifiedContexts.keySet() ) { - Pair sampleDepth = getSecondaryPileupNonrefCount(ref.getBase(), stratifiedContexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.MQ0FREE).getPileup(), snp); + Pair sampleDepth = getSecondaryPileupNonrefCount(ref.getBase(), stratifiedContexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getPileup(), snp); depth.first += sampleDepth.first; depth.second += sampleDepth.second; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java index d9a91b637..5ba5a95e8 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -15,7 +15,7 @@ public class SpanningDeletions extends StandardVariantAnnotation { int deletions = 0; int depth = 0; for ( String sample : stratifiedContexts.keySet() ) { - ReadBackedPileup pileup = stratifiedContexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.MQ0FREE).getPileup(); + ReadBackedPileup pileup = stratifiedContexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getPileup(); deletions += pileup.getNumberOfDeletions(); depth += pileup.size(); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index 7a741bc6e..198ed84a3 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -165,7 +165,7 @@ public class VariantAnnotator extends RodWalker { if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 && variant.isBiallelic() && variant.isSNP() ) { - Map stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context, null, null); + Map stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context.getPileup(), null, null); if ( stratifiedContexts != null ) annotations = getAnnotations(ref, stratifiedContexts, variant, requestedAnnotations); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java index 650d77049..1a1b205e1 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeCalculationModel.java @@ -98,7 +98,7 @@ public class DiploidGenotypeCalculationModel extends JointEstimateGenotypeCalcul if ( call instanceof ReadBacked ) { - ReadBackedPileup pileup = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.MQ0FREE).getPileup(); + ReadBackedPileup pileup = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getPileup(); ((ReadBacked)call).setPileup(pileup); } if ( call instanceof SampleBacked ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java index 3b4bca690..15f6f640d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/EMGenotypeCalculationModel.java @@ -22,7 +22,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode public Pair> calculateGenotype(RefMetaDataTracker tracker, char ref, GenomeLoc loc, Map contexts, DiploidGenotypePriors priors) { // run the EM calculation - EMOutput overall = runEM(ref, contexts, priors, StratifiedAlignmentContext.StratifiedContextType.MQ0FREE); + EMOutput overall = runEM(ref, contexts, priors, StratifiedAlignmentContext.StratifiedContextType.COMPLETE); double PofD = Math.pow(10, overall.getPofD()); double PofNull = Math.pow(10, overall.getPofNull()); @@ -93,7 +93,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode for ( String sample : GLs.keySet() ) { // create the call - AlignmentContext context = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.MQ0FREE); + AlignmentContext context = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE); GenotypeCall call = GenotypeWriterFactory.createSupportedGenotypeCall(OUTPUT_FORMAT, ref, context.getLocation()); // set the genotype and confidence @@ -105,7 +105,7 @@ public abstract class EMGenotypeCalculationModel extends GenotypeCalculationMode call.setGenotype(bestGenotype); if ( call instanceof ReadBacked ) { - ReadBackedPileup pileup = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.MQ0FREE).getPileup(); + ReadBackedPileup pileup = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE).getPileup(); ((ReadBacked)call).setPileup(pileup); } if ( call instanceof SampleBacked ) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java index dd44cbff0..a17ac618c 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/JointEstimateGenotypeCalculationModel.java @@ -55,8 +55,8 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc initializeAlleleFrequencies(frequencyEstimationPoints); - initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.MQ0FREE); - calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedAlignmentContext.StratifiedContextType.MQ0FREE); + initialize(ref, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE); + calculateAlleleFrequencyPosteriors(ref, frequencyEstimationPoints, contexts, StratifiedAlignmentContext.StratifiedContextType.COMPLETE); calculatePofFs(ref, frequencyEstimationPoints); // print out stats if we have a writer @@ -74,7 +74,7 @@ public abstract class JointEstimateGenotypeCalculationModel extends GenotypeCalc int[] qualCounts = new int[4]; for ( String sample : contexts.keySet() ) { - AlignmentContext context = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.MQ0FREE); + AlignmentContext context = contexts.get(sample).getContext(StratifiedAlignmentContext.StratifiedContextType.COMPLETE); // calculate the sum of quality scores for each base ReadBackedPileup pileup = context.getPileup(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java index 39b486bf7..fc19e16a5 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PointEstimateGenotypeCalculationModel.java @@ -40,7 +40,7 @@ public class PointEstimateGenotypeCalculationModel extends EMGenotypeCalculation return null; // get the genotype likelihoods - Pair discoveryGL = getSingleSampleLikelihoods(sampleContext, priors, StratifiedAlignmentContext.StratifiedContextType.MQ0FREE); + Pair discoveryGL = getSingleSampleLikelihoods(sampleContext, priors, StratifiedAlignmentContext.StratifiedContextType.COMPLETE); // find the index of the best genotype double[] normPosteriors = discoveryGL.second.getNormalizedPosteriors(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index f4a7001d3..8a370cf21 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -88,9 +88,15 @@ public class UnifiedArgumentCollection { @Argument(fullName = "min_allele_frequency", shortName = "min_freq", doc = "The minimum possible allele frequency in a population (advanced)", required = false) public double MINIMUM_ALLELE_FREQUENCY = 1e-8; - @Argument(fullName = "minBaseQualityScore", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) - public Integer MIN_BASE_QUALTY_SCORE = -1; + @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) + public int MIN_BASE_QUALTY_SCORE = 20; - @Argument(fullName = "minMappingQualityScore", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false) - public Integer MIN_MAPPING_QUALTY_SCORE = -1; + @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling (MQ0 reads are ignored regardless)", required = false) + public int MIN_MAPPING_QUALTY_SCORE = 30; + + @Argument(fullName = "max_mismatches_in_40bp_window", shortName = "mm40", doc = "Maximum number of mismatches within a 40 bp window (20bp on either side) around the target position for a read to be used for calling", required = false) + public int MAX_MISMATCHES = 3; + + @Argument(fullName = "use_reads_with_bad_mates", shortName = "bad_mates", doc = "Use reads whose mates are mapped excessively far away for calling", required = false) + public boolean USE_BADLY_MATED_READS = false; } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index da3bae24b..88ed9cf74 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -26,19 +26,16 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.contexts.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotator; -import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.*; import org.broadinstitute.sting.utils.cmdLine.*; import org.broadinstitute.sting.utils.genotype.*; -import org.broadinstitute.sting.utils.genotype.vcf.VCFGenotypeRecord; -import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderLine; -import org.broadinstitute.sting.utils.genotype.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.genotype.vcf.*; import net.sf.samtools.SAMReadGroupRecord; @@ -51,6 +48,7 @@ import java.util.*; * multi-sample, and pooled data. The user can choose from several different incorporated calculation models. */ @Reference(window=@Window(start=-20,stop=20)) +@ReadFilters({ZeroMappingQualityReadFilter.class}) public class UnifiedGenotyper extends LocusWalker>, Integer> { @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); @@ -174,6 +172,7 @@ public class UnifiedGenotyper extends LocusWalker 0 && rawContext.getPileup().size() > UAC.MAX_READS_IN_PILEUP) ) + if ( pileup.size() == 0 || + (UAC.MAX_READS_IN_PILEUP > 0 && pileup.size() > UAC.MAX_READS_IN_PILEUP) ) return null; // are there too many deletions in the pileup? - ReadBackedPileup pileup = rawContext.getPileup().getBaseAndMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE, UAC.MIN_MAPPING_QUALTY_SCORE); - AlignmentContext context = new AlignmentContext(rawContext.getLocation(), pileup); if ( isValidDeletionFraction(UAC.MAX_DELETION_FRACTION) && - (double)pileup.getPileupWithoutMappingQualityZeroReads().getNumberOfDeletions() / (double)(pileup.size() - pileup.getNumberOfMappingQualityZeroReads()) > UAC.MAX_DELETION_FRACTION ) + (double)pileup.getNumberOfDeletions() / (double)pileup.size() > UAC.MAX_DELETION_FRACTION ) return null; // stratify the AlignmentContext and cut by sample // Note that for testing purposes, we may want to throw multi-samples at pooled mode - Map stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(context, UAC.ASSUME_SINGLE_SAMPLE, (UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED ? PooledCalculationModel.POOL_SAMPLE_NAME : null)); + Map stratifiedContexts = StratifiedAlignmentContext.splitContextBySample(pileup, UAC.ASSUME_SINGLE_SAMPLE, (UAC.genotypeModel == GenotypeCalculationModel.Model.POOLED ? PooledCalculationModel.POOL_SAMPLE_NAME : null)); if ( stratifiedContexts == null ) return null; DiploidGenotypePriors priors = new DiploidGenotypePriors(ref, UAC.heterozygosity, DiploidGenotypePriors.PROB_OF_TRISTATE_GENOTYPE); - Pair> call = gcm.calculateGenotype(tracker, ref, context.getLocation(), stratifiedContexts, priors); + Pair> call = gcm.calculateGenotype(tracker, ref, rawContext.getLocation(), stratifiedContexts, priors); // annotate the call, if possible if ( call != null && call.first != null && call.first instanceof ArbitraryFieldsBacked ) { @@ -237,6 +240,18 @@ public class UnifiedGenotyper extends LocusWalker filteredPileup = new ArrayList(); + for ( PileupElement p : pileup ) { + if ( (useBadMates || !p.getRead().getReadPairedFlag() || p.getRead().getMateUnmappedFlag() || p.getRead().getMateReferenceIndex() == p.getRead().getReferenceIndex()) && + AlignmentUtils.mismatchesInRefWindow(p, refContext, true) <= maxMismatches ) + filteredPileup.add(p); + } + return new ReadBackedPileup(pileup.getLocation(), filteredPileup); + + } + private static boolean isValidDeletionFraction(double d) { return ( d >= 0.0 && d <= 1.0 ); } diff --git a/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java b/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java index 287afc5d7..c9c4e9ffd 100644 --- a/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java +++ b/java/src/org/broadinstitute/sting/utils/AlignmentUtils.java @@ -118,7 +118,7 @@ public class AlignmentUtils { * @param refSeq chunk of reference sequence that subsumes the alignment completely (if alignment runs out of * the reference string, IndexOutOfBound exception will be thrown at runtime). * @param refIndex zero-based position, at which the alignment starts on the specified reference string. - * @return + * @return the number of mismatches */ public static int numMismatches(SAMRecord r, String refSeq, int refIndex) { int readIdx = 0; @@ -160,15 +160,12 @@ public class AlignmentUtils { * @param pileup the pileup with reads * @param ref the reference context * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) - * @return a pair where the first value is the mismatches and the second is the total bases within the context + * @return the number of mismatches */ - public static Pair mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) { - Pair mismatches = new Pair(0, 0); - for ( PileupElement p : pileup ) { - Pair mm = mismatchesInRefWindow(p, ref, ignoreTargetSite); - mismatches.first += mm.first; - mismatches.second += mm.second; - } + public static int mismatchesInRefWindow(ReadBackedPileup pileup, ReferenceContext ref, boolean ignoreTargetSite) { + int mismatches = 0; + for ( PileupElement p : pileup ) + mismatches += mismatchesInRefWindow(p, ref, ignoreTargetSite); return mismatches; } @@ -177,12 +174,11 @@ public class AlignmentUtils { * @param p the pileup element * @param ref the reference context * @param ignoreTargetSite if true, ignore mismatches at the target locus (i.e. the center of the window) - * @return a pair where the first value is the mismatches and the second is the total bases within the context + * @return the number of mismatches */ - public static Pair mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) { + public static int mismatchesInRefWindow(PileupElement p, ReferenceContext ref, boolean ignoreTargetSite) { int mismatches = 0; - int totalBases = 0; GenomeLoc window = ref.getWindow(); char[] refBases = ref.getBases(); @@ -212,7 +208,6 @@ public class AlignmentUtils { if ( ignoreTargetSite && ref.getLocus().getStart() == currentPos ) continue; - totalBases++; char readChr = (char)readBases[readIndex]; if ( Character.toUpperCase(readChr) != Character.toUpperCase(refChr) ) mismatches++; @@ -234,8 +229,7 @@ public class AlignmentUtils { } - //System.out.println("There are " + mismatches + " mismatches out of " + totalBases + " bases for read " + p.getRead().getReadName()); - return new Pair(mismatches, totalBases); + return mismatches; } /** Returns number of alignment blocks (continuous stretches of aligned bases) in the specified alignment. diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java index 447f52951..19b00a8db 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFGenotypeRecord.java @@ -266,7 +266,7 @@ public class VCFGenotypeRecord implements Genotype, SampleBacked { Set result = new HashSet(); result.add(new VCFFormatHeaderLine(GENOTYPE_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.String, "Genotype")); result.add(new VCFFormatHeaderLine(GENOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Genotype Quality")); - result.add(new VCFFormatHeaderLine(DEPTH_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Read Depth (without MQ0 reads)")); + result.add(new VCFFormatHeaderLine(DEPTH_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Read Depth (only filtered reads used for calling)")); //result.add(new VCFFormatHeaderLine(HAPLOTYPE_QUALITY_KEY, 1, VCFFormatHeaderLine.INFO_TYPE.Integer, "Haplotype Quality")); return result; } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SecondBaseSkewIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SecondBaseSkewIntegrationTest.java index accfd5d4d..38731d918 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SecondBaseSkewIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/SecondBaseSkewIntegrationTest.java @@ -33,7 +33,7 @@ public class SecondBaseSkewIntegrationTest extends WalkerTest { +"-B variant,Variants,/humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_pilot_pool3_raw_calls.geli " +"-vcf %s -sample variant -L /humgen/gsa-scr1/GATK_Data/Validation_Data/FHS_test_intervals.interval_list"; - String md5_for_this_test = "f7e67c353d3113447d1b9c8c39de6ed0"; + String md5_for_this_test = "4bd8a28bcbad107b102fc796918d5932"; WalkerTestSpec spec = new WalkerTestSpec(test_args,1, Arrays.asList(md5_for_this_test)); executeTest("Testing on E2 annotated but not Q2 annotated file ",spec); diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 90284a1c9..ecb342d99 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -standard -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("1225d977f3e83d362141c1bdb4730016")); + Arrays.asList("d9f623b69e20b9c6289562f123b867eb")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -standard -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample3.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("7234adba82893ebf63338ab20ecc610d")); + Arrays.asList("d0e70ee36ed1a59b5f086bc30c9b2673")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -98,7 +98,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -standard -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample2empty.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("05d7f1c30130ce13d355200be8c2b31d")); + Arrays.asList("23928a3f79fd5d841505cdeaf342c8dc")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -106,7 +106,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -standard -B variant,VCF,/humgen/gsa-scr1/GATK_Data/Validation_Data/vcfexample3empty.vcf -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("643fa948531ebbfa013e9eeced76430b")); + Arrays.asList("2b47c7a6a7f0ce15fd1d1dd02ecab73b")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 4672c88da..1ff579642 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -20,9 +20,9 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { return baseTestString() + " --variant_output_format GELI -confidence 50"; } - private static String OneMb1StateMD5 = "7e3fc1d8427329eb2a3e05a81011749a"; - private static String OneMb3StateMD5 = "f5912d5d6585436a77495688f09cf1cc"; - private static String OneMbEmpiricalMD5 = "b9b2d9c7eb9a7af416fddd3b77a72efe"; + private static String OneMb1StateMD5 = "c664bd887b89e9ebc0b4b569ad8eb128"; + private static String OneMb3StateMD5 = "9c68f6e900d081023ea97ec467a95bd8"; + private static String OneMbEmpiricalMD5 = "0b891eefeb2a2bfe707a8a0838b6d049"; // private static String oneMbMD5(BaseMismatchModel m) { // switch (m) { @@ -47,7 +47,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1PointEM() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,400-10,024,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1, - Arrays.asList("ad00b255c83fe43213758b280646e4b4")); + Arrays.asList("b19f85fddd08485c668e7192df79d944")); executeTest("testMultiSamplePilot1 - Point Estimate EM", spec); } @@ -55,7 +55,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot2PointEM() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,010,000 -bm empirical -gm EM_POINT_ESTIMATE -confidence 30", 1, - Arrays.asList("e709714e288c508f79c1ffc5cdbe9cca")); + Arrays.asList("a6c8e77d1741f3f6d958a0634fa59e14")); executeTest("testMultiSamplePilot2 - Point Estimate EM", spec); } @@ -68,7 +68,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testPooled1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,023,000-10,024,000 -bm empirical -gm POOLED -ps 60 -confidence 30", 1, - Arrays.asList("2214983d4ae3a19bc16e7cb89fc3128f")); + Arrays.asList("459b8f6a6265b67718495e9bffe96fa8")); executeTest("testPooled1", spec); } @@ -81,7 +81,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/low_coverage_CEU.chr1.10k-11k.bam -varout %s -L 1:10,022,000-10,025,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, - Arrays.asList("9e98eba56add9989e2d40834efc8c803")); + Arrays.asList("c132c93cec5fdf02c3235180a7aa7dcc")); executeTest("testMultiSamplePilot1 - Joint Estimate", spec); } @@ -89,7 +89,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/pilot2_daughters.chr20.10k-11k.bam -varout %s -L 20:10,000,000-10,050,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, - Arrays.asList("fe1077636feb41c69f7c4f64147e2bba")); + Arrays.asList("09d48c56eae25fb6418e10feeb5b3fc5")); executeTest("testMultiSamplePilot2 - Joint Estimate", spec); } @@ -97,7 +97,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2Joint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,100,000 -bm empirical -gm JOINT_ESTIMATE -confidence 30", 1, - Arrays.asList("6a739a0acb620aeed5d8e0de7b59877a")); + Arrays.asList("3544b1eb97834e2050239c101eaddb2d")); executeTest("testSingleSamplePilot2 - Joint Estimate", spec); } @@ -105,7 +105,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testGenotypeModeJoint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -genotype -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,001,000 -bm empirical -gm JOINT_ESTIMATE -confidence 70", 1, - Arrays.asList("85fbcdd816ecc1d9686df244a6e039dd")); + Arrays.asList("7e8422f8008a4fe24ea1f5c2913b5d31")); executeTest("testGenotypeMode - Joint Estimate", spec); } @@ -113,7 +113,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testAllBasesModeJoint() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -all_bases -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -varout %s -L 1:10,000,000-10,001,000 -bm empirical -gm JOINT_ESTIMATE -confidence 70", 1, - Arrays.asList("f871576cf49e7a83a0f6b5580b0178c2")); + Arrays.asList("aedd59f9f2cae7fb07a53812b680852b")); executeTest("testAllBasesMode - Joint Estimate", spec); } @@ -141,7 +141,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -bm empirical" + " -vf GELI", 1, - Arrays.asList("46acd5a5d7301129ee3c0c9a4ab10259")); + Arrays.asList("eaca4b2323714dbd7c3ed379ce1843ba")); executeTest(String.format("testMultiTechnologies"), spec); } @@ -178,7 +178,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void genotypeTest() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( testGeliLod5() + " -L 1:10,000,000-10,100,000 -bm empirical --genotype", 1, - Arrays.asList("db34a348abd13ee894780296e8a7e909")); + Arrays.asList("f9bdd9a8864467dbc4e5356bb8801a33")); executeTest("genotypeTest", spec); } @@ -237,8 +237,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testLOD() { HashMap e = new HashMap(); - e.put( 100.0, "94c5b48c0c956fcdacbffaa38a80d926" ); - e.put( 30.0, "df455abc1b2bc533aa1dc6eb088a835a" ); + e.put( 100.0, "6eec841b28fae433015b3d85608e03f7" ); + e.put( 30.0, "1b3365f41bbf6867516699afe9efc5f8" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -256,9 +256,9 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "b8837be7e8beb3ab2ed7150cdc022c65" ); - e.put( 0.0001, "ef0f2af7d13f166829d86b15fabc2b81" ); - e.put( 1.0 / 1850, "a435c8c966c11f4393a25a9d01c4fc3d" ); + e.put( 0.01, "601c48fc350083d14534ba5c3093edb9" ); + e.put( 0.0001, "bd03f7307314e45951d4d3e85fe63d16" ); + e.put( 1.0 / 1850, "662d479f1cd54480da1d0e66c81259b0" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -275,7 +275,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void empirical1MbTestBinaryGeli() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseTestString() + " -L 1:10,000,000-11,000,000 -bm empirical --variant_output_format GELI_BINARY -confidence 50", 1, - Arrays.asList("18f175c7ccaeca57b8d412e9f4ebbe50")); + Arrays.asList("b1027cf309c9ab7572528ce986e2c2d4")); executeTest("empirical1MbTestBinaryGeli", spec); } }