From d26183e0ecde460b72dc432b761f52dd4cadb5f4 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Thu, 16 Aug 2012 20:36:53 -0400 Subject: [PATCH 01/32] First preliminary big refactoring of UG annotation engine. Goals: a) Remove gigantic hack that cached per-read haplotype likelihoods in a static array so that annotations would go back and retrieve them, b) unify interface for annotations between HaplotypeCaller and UnifiedGenotyper, c) as a consequence, removed and cleaned duplicated code. As a bonus, annotations have now more relevant info to help them compute values. Major idea is that per-read haplotype likelihoods are now stored in a single unified object of class PerReadAlleleLikelihoodMap. Class implementation in theory hides internal storage details from outside work (still may need work cleaning up interface), and this object(or rather, a Map from Sample->perReadAlleleLikelihoodMap) is produced by UGCalcLikelihoods. The genotype calculation is also able to potentially use this info if needed. All InfoFieldAnnotations now get an extra argument with this map. Currently, this map is only produced for indels in UG, or for all variants within HaplotypeCaller. If this map is absent (SNPs in UG), the old Pileup interface is used, but it's avoided whenever possible. FORMAT annotations are not yet changed but will be focus of second step. Major benefit will be that annotations will be able to very easily discard non-informative reads for certain events. HaplotypeCaller also uses this new class, and no longer hard-codes the mapping of allele ->list(reads) but instead uses the same objects and interfaces as the rest of the modules. Code still needs further testing/cleaning/reviewing/debugging --- ...dyGenotypeLikelihoodsCalculationModel.java | 28 ++-- ...GeneralPloidyIndelGenotypeLikelihoods.java | 8 +- ...elGenotypeLikelihoodsCalculationModel.java | 6 +- .../haplotypecaller/HaplotypeCaller.java | 7 +- .../LikelihoodCalculationEngine.java | 47 ++----- .../SimpleDeBruijnAssembler.java | 3 +- .../gatk/walkers/annotator/AlleleBalance.java | 8 +- .../gatk/walkers/annotator/BaseCounts.java | 8 +- .../annotator/BaseQualityRankSumTest.java | 80 ++++------- .../walkers/annotator/ChromosomeCounts.java | 15 +- .../annotator/ClippingRankSumTest.java | 79 +++-------- .../walkers/annotator/DepthOfCoverage.java | 39 +++--- .../annotator/DepthPerAlleleBySample.java | 8 +- .../gatk/walkers/annotator/FisherStrand.java | 129 ++++-------------- .../gatk/walkers/annotator/GCContent.java | 8 +- .../walkers/annotator/HaplotypeScore.java | 46 ++++--- .../gatk/walkers/annotator/HardyWeinberg.java | 8 +- .../walkers/annotator/HomopolymerRun.java | 8 +- .../walkers/annotator/InbreedingCoeff.java | 14 +- .../gatk/walkers/annotator/IndelType.java | 10 +- .../sting/gatk/walkers/annotator/LowMQ.java | 8 +- .../walkers/annotator/MVLikelihoodRatio.java | 8 +- .../annotator/MappingQualityRankSumTest.java | 83 +++++------ .../walkers/annotator/MappingQualityZero.java | 8 +- .../annotator/MappingQualityZeroFraction.java | 8 +- .../gatk/walkers/annotator/NBaseCount.java | 8 +- .../gatk/walkers/annotator/QualByDepth.java | 66 ++++----- .../walkers/annotator/RMSMappingQuality.java | 87 ++++++------ .../gatk/walkers/annotator/RankSumTest.java | 105 ++++---------- .../walkers/annotator/ReadPosRankSumTest.java | 124 ++++++----------- .../gatk/walkers/annotator/SampleList.java | 8 +- .../sting/gatk/walkers/annotator/SnpEff.java | 8 +- .../walkers/annotator/SpanningDeletions.java | 8 +- .../annotator/TandemRepeatAnnotator.java | 8 +- .../annotator/TechnologyComposition.java | 8 +- .../TransmissionDisequilibriumTest.java | 8 +- .../annotator/VariantAnnotatorEngine.java | 18 ++- .../ActiveRegionBasedAnnotation.java | 5 +- .../interfaces/InfoFieldAnnotation.java | 22 ++- .../GenotypeLikelihoodsCalculationModel.java | 4 +- ...elGenotypeLikelihoodsCalculationModel.java | 30 ++-- .../genotyper/PerReadAlleleLikelihoodMap.java | 128 +++++++++++++++++ ...NPGenotypeLikelihoodsCalculationModel.java | 5 +- .../genotyper/UnifiedGenotyperEngine.java | 75 +++++++--- .../indels/PairHMMIndelErrorModel.java | 78 ++++++----- .../broadinstitute/sting/utils/MathUtils.java | 22 +++ 46 files changed, 775 insertions(+), 724 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMap.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java index f6ce818be..4c20700ac 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java @@ -41,15 +41,6 @@ import java.util.*; public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { - //protected Set laneIDs; - public enum Model { - SNP, - INDEL, - POOLSNP, - POOLINDEL, - BOTH - } - final protected UnifiedArgumentCollection UAC; protected GeneralPloidyGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { @@ -203,7 +194,8 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G final AlignmentContextUtils.ReadOrientation contextType, final List allAllelesToUse, final boolean useBAQedPileup, - final GenomeLocParser locParser) { + final GenomeLocParser locParser, + final Map perReadAlleleLikelihoodMap) { HashMap perLaneErrorModels = getPerLaneErrorModels(tracker, ref, contexts); if (perLaneErrorModels == null && UAC.referenceSampleName != null) @@ -215,8 +207,11 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G newContext.put(DUMMY_SAMPLE_NAME,mergedContext); contexts = newContext; } - - // get initial alleles to genotype + if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) { + // starting a new site: clear allele list + perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods + } + // get initial alleles to genotype final List allAlleles = new ArrayList(); if (allAllelesToUse == null || allAllelesToUse.isEmpty()) allAlleles.addAll(getInitialAllelesToUse(tracker, ref,contexts,contextType,locParser, allAllelesToUse)); @@ -234,9 +229,13 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G continue; ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); + if (!perReadAlleleLikelihoodMap.containsKey(sample.getKey())){ + // no likelihoods have been computed for this sample at this site + perReadAlleleLikelihoodMap.put(sample.getKey(), new PerReadAlleleLikelihoodMap()); + } // create the GenotypeLikelihoods object - final GeneralPloidyGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO); + final GeneralPloidyGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO, perReadAlleleLikelihoodMap.get(sample.getKey())); // actually compute likelihoods final int nGoodBases = GL.add(pileup, UAC); if ( nGoodBases > 0 ) @@ -333,7 +332,8 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G final HashMap perLaneErrorModels, final boolean useBQAedPileup, final ReferenceContext ref, - final boolean ignoreLaneInformation); + final boolean ignoreLaneInformation, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap); protected abstract List getInitialAllelesToUse(final RefMetaDataTracker tracker, final ReferenceContext ref, diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java index 4f42f820e..e562bd265 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java @@ -26,6 +26,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype double[][] readHaplotypeLikelihoods; final byte refBase; + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap; public GeneralPloidyIndelGenotypeLikelihoods(final List alleles, final double[] logLikelihoods, @@ -34,7 +35,8 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype final boolean ignoreLaneInformation, final PairHMMIndelErrorModel pairModel, final LinkedHashMap haplotypeMap, - final ReferenceContext referenceContext) { + final ReferenceContext referenceContext, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) { super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation); this.pairModel = pairModel; this.haplotypeMap = haplotypeMap; @@ -42,6 +44,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles); // todo - not needed if indel alleles have base at current position this.refBase = referenceContext.getBase(); + this.perReadAlleleLikelihoodMap = perReadAlleleLikelihoodMap; } // ------------------------------------------------------------------------------------- @@ -142,10 +145,9 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype List numSeenBases = new ArrayList(this.alleles.size()); if (!hasReferenceSampleData) { - final int numHaplotypes = haplotypeMap.size(); final int readCounts[] = new int[pileup.getNumberOfElements()]; - readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(), readCounts); + readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, perReadAlleleLikelihoodMap, readCounts); n = readHaplotypeLikelihoods.length; } else { Allele refAllele = null; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java index f6559f666..fc0c526bc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java @@ -73,8 +73,9 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener final HashMap perLaneErrorModels, final boolean useBQAedPileup, final ReferenceContext ref, - final boolean ignoreLaneInformation){ - return new GeneralPloidyIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref); + final boolean ignoreLaneInformation, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){ + return new GeneralPloidyIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref, perReadAlleleLikelihoodMap); } protected List getInitialAllelesToUse(final RefMetaDataTracker tracker, @@ -90,7 +91,6 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener if (alleles.size() > MAX_NUM_ALLELES_TO_GENOTYPE) alleles = alleles.subList(0,MAX_NUM_ALLELES_TO_GENOTYPE); if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) { - IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().clear(); haplotypeMap.clear(); } IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(alleles, ref, ref.getLocus(), haplotypeMap); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index ec4fb3950..cde76ca7a 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.gatk.walkers.genotyper.*; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.commandline.*; @@ -44,10 +45,6 @@ import org.broadinstitute.sting.gatk.walkers.PartitionBy; import org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.codecs.vcf.*; @@ -417,7 +414,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) { if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); } - final Map>> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult ); + final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult ); final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst()); // add some custom annotations to the calls diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index fabf5633f..9ba434100 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -323,11 +324,13 @@ public class LikelihoodCalculationEngine { return bestHaplotypes; } - public static Map>> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, final Pair>> call) { - final Map>> returnMap = new HashMap>>(); + public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, final Pair>> call) { + final Map returnMap = new HashMap(); final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst()); for( final Map.Entry> sample : perSampleReadList.entrySet() ) { - final Map> alleleReadMap = new HashMap>(); + //final Map> alleleReadMap = new HashMap>(); + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + final ArrayList readsForThisSample = sample.getValue(); for( int iii = 0; iii < readsForThisSample.size(); iii++ ) { final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same! @@ -335,51 +338,31 @@ public class LikelihoodCalculationEngine { if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { final double likelihoods[] = new double[call.getFirst().getAlleles().size()]; int count = 0; - for( final Allele a : call.getFirst().getAlleles() ) { // find the allele with the highest haplotype likelihood - double maxLikelihood = Double.NEGATIVE_INFINITY; + + for( final Allele a : call.getFirst().getAlleles() ) { for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object) final double likelihood = h.getReadLikelihoods(sample.getKey())[iii]; - if( likelihood > maxLikelihood ) { - maxLikelihood = likelihood; - } - } - likelihoods[count++] = maxLikelihood; - } - final int bestAllele = MathUtils.maxElementIndex(likelihoods); - final double bestLikelihood = likelihoods[bestAllele]; - Allele allele = Allele.NO_CALL; - boolean isInformativeRead = false; - for( final double likelihood : likelihoods ) { - if( bestLikelihood - likelihood > BEST_LIKELIHOOD_THRESHOLD ) { - isInformativeRead = true; - break; + likelihoodMap.add(read, a, likelihood); } } - // uninformative reads get the no call Allele - if( isInformativeRead ) { - allele = call.getFirst().getAlleles().get(bestAllele); - } - List readList = alleleReadMap.get(allele); - if( readList == null ) { - readList = new ArrayList(); - alleleReadMap.put(allele, readList); - } - readList.add(read); } } - // add all filtered reads to the NO_CALL list because they weren't given any likelihoods +/* // add all filtered reads to the NO_CALL list because they weren't given any likelihoods List readList = alleleReadMap.get(Allele.NO_CALL); if( readList == null ) { readList = new ArrayList(); alleleReadMap.put(Allele.NO_CALL, readList); } - for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { + */ + /* for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { // only count the read if it overlaps the event, otherwise it is not added to the output read list at all if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { readList.add(read); } } - returnMap.put(sample.getKey(), alleleReadMap); + */ + returnMap.put(sample.getKey(), likelihoodMap); + } return returnMap; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index 56cb6c3d4..71aee44b8 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -201,7 +201,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { // compute mean number of reduced read counts in current kmer span final byte[] counts = Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1); // precise rounding can make a difference with low consensus counts - countNumber = (int)Math.round((double)MathUtils.sum(counts)/counts.length); + countNumber = MathUtils.arrayMax(counts); + // countNumber = (int)Math.round((double)MathUtils.sum(counts)/counts.length); } if( !badKmer ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java index 30f81b20c..a68f0df21 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -51,7 +52,12 @@ public class AlleleBalance extends InfoFieldAnnotation { char[] BASES = {'A','C','G','T'}; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java index c3b6de65a..3cbca4f52 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -52,7 +53,12 @@ import java.util.Map; */ public class BaseCounts extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index bd884892c..3f1eaa139 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -2,6 +2,8 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -21,66 +23,40 @@ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnot public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities")); } - protected void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals) { - for ( final PileupElement p : pileup ) { - if( isUsableBase(p) ) { - if ( p.getBase() == ref ) - refQuals.add((double)p.getQual()); - else if ( alts.contains(p.getBase()) ) - altQuals.add((double)p.getQual()); - } - } - } - protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { - // TODO -- implement me; how do we pull out the correct offset from the read? - return; - -/* - for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { - final boolean matchesRef = ref.equals(alleleBin.getKey()); - final boolean matchesAlt = alts.contains(alleleBin.getKey()); - if ( !matchesRef && !matchesAlt ) - continue; - - for ( final GATKSAMRecord read : alleleBin.getValue() ) { + protected void fillQualsFromPileup(final List allAlleles, final int refLoc, + final ReadBackedPileup pileup, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap, + final List refQuals, final List altQuals){ + if (alleleLikelihoodMap == null) { + // use fast SNP-based version if we don't have per-read allele likelihoods + for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { - if ( matchesRef ) + if ( allAlleles.get(0).equals(Allele.create(p.getBase())) ) { refQuals.add((double)p.getQual()); - else + } else if ( allAlleles.contains(Allele.create(p.getBase()))) { altQuals.add((double)p.getQual()); - } - } - } -*/ - } - - protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { - // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? - HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - for (final PileupElement p: pileup) { - if (indelLikelihoodMap.containsKey(p)) { - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); - // by design, first element in LinkedHashMap was ref allele - double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; - - for (Map.Entry entry : el.entrySet()) { - - if (entry.getKey().isReference()) - refLikelihood = entry.getValue(); - else { - double like = entry.getValue(); - if (like >= altLikelihood) - altLikelihood = like; } } - if (refLikelihood > altLikelihood + INDEL_LIKELIHOOD_THRESH) - refQuals.add(-10.0*refLikelihood); - else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) - altQuals.add(-10.0*altLikelihood); } + return; + } + + for (Map.Entry> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + if (!isUsableBase(el.getKey())) + continue; + + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (a.isNoCall()) + continue; // read is non-informative + if (a.isReference()) + refQuals.add(-10.0*(double)el.getValue().get(a)); + else if (allAlleles.contains(a)) + altQuals.add(-10.0*(double)el.getValue().get(a)); + + } } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index 54837baad..4ae1a0bba 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -61,7 +62,12 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn private Set founderIds = new HashSet(); - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { if ( ! vc.hasGenotypes() ) return null; @@ -73,13 +79,6 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn founderIds = ((Walker)walker).getSampleDB().getFounderIds(); } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if ( ! vc.hasGenotypes() ) - return null; - - return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap(), true); - } - public List getKeyNames() { return Arrays.asList(keyNames); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java index f41a40621..fdbbf6732 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -24,68 +25,26 @@ public class ClippingRankSumTest extends RankSumTest { public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ClippingRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases")); } - protected void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals) { - return; - // This working implementation below needs to be tested for the UG pipeline - /* - for ( final PileupElement p : pileup ) { - if ( isUsableBase(p) ) { - if ( p.getBase() == ref ) { - refQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); - } else if ( alts.contains(p.getBase()) ) { - altQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); - } - } - } - */ - } - protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { - for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { - final boolean matchesRef = ref.equals(alleleBin.getKey()); - final boolean matchesAlt = alts.contains(alleleBin.getKey()); - if ( !matchesRef && !matchesAlt ) - continue; + protected void fillQualsFromPileup(final List allAlleles, + final int refLoc, + final ReadBackedPileup pileup, + final PerReadAlleleLikelihoodMap likelihoodMap, final List refQuals, final List altQuals) { + // todo - only support non-pileup case for now, e.g. active-region based version + if (pileup != null) + return; + + for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { + + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (a.isNoCall()) + continue; // read is non-informative + if (a.isReference()) + refQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey().getRead())); + else if (allAlleles.contains(a)) + altQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey().getRead())); - for ( final GATKSAMRecord read : alleleBin.getValue() ) { - if ( matchesRef ) - refQuals.add((double)AlignmentUtils.getNumHardClippedBases(read)); - else - altQuals.add((double)AlignmentUtils.getNumHardClippedBases(read)); - } } } - protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { - return; - // This working implementation below needs to be tested for the UG pipeline - - /* - // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? - HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - for (final PileupElement p: pileup) { - if (indelLikelihoodMap.containsKey(p) && p.getMappingQual() != 0 && p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE) { - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); - // by design, first element in LinkedHashMap was ref allele - double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; - - for (Allele a : el.keySet()) { - - if (a.isReference()) - refLikelihood =el.get(a); - else { - double like = el.get(a); - if (like >= altLikelihood) - altLikelihood = like; - } - } - if (refLikelihood > altLikelihood + INDEL_LIKELIHOOD_THRESH) - refQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); - else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) - altQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); - } - } - */ - } -} + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index 28ca77f18..8f67414fa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; @@ -38,28 +39,30 @@ import java.util.Map; */ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { int depth = 0; - for ( Map.Entry sample : stratifiedContexts.entrySet() ) - depth += sample.getValue().getBasePileup().depthOfCoverage(); - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%d", depth)); - return map; - } + if (stratifiedContexts != null) { + if ( stratifiedContexts.size() == 0 ) + return null; - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; - - int depth = 0; - for ( final Map> alleleBins : stratifiedContexts.values() ) { - for ( final List alleleBin : alleleBins.values() ) { - depth += alleleBin.size(); - } + for ( Map.Entry sample : stratifiedContexts.entrySet() ) + depth += sample.getValue().getBasePileup().depthOfCoverage(); } + else if (perReadAlleleLikelihoodMap != null) { + if ( perReadAlleleLikelihoodMap.size() == 0 ) + return null; + + for ( Map.Entry sample : perReadAlleleLikelihoodMap.entrySet() ) + depth += sample.getValue().getLikelihoodReadMap().size(); + } + else + return null; Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%d", depth)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index a9edab752..cd8faf093 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -42,7 +42,13 @@ import java.util.List; */ public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation { - public void annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g, GenotypeBuilder gb) { + public void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb) { if ( g == null || !g.isCalled() ) return; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 131670599..610d5e7b0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -54,21 +55,29 @@ import java.util.*; public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private static final String FS = "FS"; private static final double MIN_PVALUE = 1E-320; - - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( !vc.isVariant() ) return null; int[][] table; - if ( vc.isSNP() ) + if (stratifiedPerReadAlleleLikelihoodMap != null) { + table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); + } + else if (vc.isSNP() && stratifiedContexts != null) { table = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); - else if ( vc.isIndel() || vc.isMixed() ) { - table = getIndelContingencyTable(stratifiedContexts); - if (table == null) - return null; } else + // for non-snp variants, we need per-read likelihoods. + // for snps, we can get same result from simple pileup + return null; + + if (table == null) return null; Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE); @@ -80,22 +89,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return map; } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if ( !vc.isVariant() ) - return null; - - final int[][] table = getContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); - - final Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE); - if ( pvalue == null ) - return null; - - final Map map = new HashMap(); - map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue))); - return map; - - } - public List getKeyNames() { return Arrays.asList(FS); } @@ -161,7 +154,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat table[0][1] += 1; table[1][1] -= 1; - return (table[0][0] >= 0 && table[1][1] >= 0) ? true : false; + return (table[0][0] >= 0 && table[1][1] >= 0); } private static boolean unrotateTable(int[][] table) { @@ -171,7 +164,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat table[0][1] -= 1; table[1][1] += 1; - return (table[0][1] >= 0 && table[1][0] >= 0) ? true : false; + return (table[0][1] >= 0 && table[1][0] >= 0); } private static double computePValue(int[][] table) { @@ -218,31 +211,29 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat * allele2 # # * @return a 2x2 contingency table */ - private static int[][] getContingencyTable(Map>> stratifiedContexts, Allele ref, Allele alt) { + private static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, + final Allele ref, final Allele alt) { int[][] table = new int[2][2]; - for ( final Map> alleleBins : stratifiedContexts.values() ) { - for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { + for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { + for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { + final boolean matchesRef = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(ref); + final boolean matchesAlt = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(alt); - final boolean matchesRef = ref.equals(alleleBin.getKey()); - final boolean matchesAlt = alt.equals(alleleBin.getKey()); if ( !matchesRef && !matchesAlt ) continue; - for ( final GATKSAMRecord read : alleleBin.getValue() ) { - boolean isFW = read.getReadNegativeStrandFlag(); + boolean isFW = el.getKey().getRead().getReadNegativeStrandFlag(); - int row = matchesRef ? 0 : 1; - int column = isFW ? 0 : 1; + int row = matchesRef ? 0 : 1; + int column = isFW ? 0 : 1; - table[row][column]++; - } + table[row][column]++; } } return table; } - /** Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: * fw rc @@ -275,69 +266,5 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat return table; } - /** - Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this: - * fw rc - * allele1 # # - * allele2 # # - * @return a 2x2 contingency table - */ - private static int[][] getIndelContingencyTable(Map stratifiedContexts) { - final double INDEL_LIKELIHOOD_THRESH = 0.3; - final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - if (indelLikelihoodMap == null) - return null; - - int[][] table = new int[2][2]; - - for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - final AlignmentContext context = sample.getValue(); - if ( context == null ) - continue; - - final ReadBackedPileup pileup = context.getBasePileup(); - for ( final PileupElement p : pileup ) { - if ( ! RankSumTest.isUsableBase(p, true) || p.getRead().isReducedRead() ) // ignore reduced reads - continue; - if ( indelLikelihoodMap.containsKey(p) ) { - // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. - // A pileup element then has a list of pairs of form (Allele, likelihood of this allele). - // To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles. - // If likelihood of ref allele > highest likelihood of all alt alleles + epsilon, then this pileup element is "ref" - // otherwise if highest alt allele likelihood is > ref likelihood + epsilon, then this pileup element it "alt" - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); - // by design, first element in LinkedHashMap was ref allele - boolean isFW = !p.getRead().getReadNegativeStrandFlag(); - - double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; - - for (Map.Entry entry : el.entrySet()) { - - if (entry.getKey().isReference()) - refLikelihood = entry.getValue(); - else { - double like = entry.getValue(); - if (like >= altLikelihood) - altLikelihood = like; - } - } - - boolean matchesRef = (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)); - boolean matchesAlt = (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)); - if ( matchesRef || matchesAlt ) { - int row = matchesRef ? 0 : 1; - int column = isFW ? 0 : 1; - - table[row][column]++; - } - - - } - } - } - - return table; - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java index fba30b3f7..3fe5c5837 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -25,7 +26,12 @@ import java.util.Map; @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { double content = computeGCContent(ref); Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", content)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index c6d8883c5..e01c51f4b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; @@ -60,7 +61,12 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot private final static int MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER = 50; private final static char REGEXP_WILDCARD = '.'; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if (stratifiedContexts.size() == 0) // size 0 means that call was made by someone else and we have no data here return null; @@ -88,7 +94,9 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot if (vc.isSNP()) scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense else if (vc.isIndel() || vc.isMixed()) { - Double d = scoreIndelsAgainstHaplotypes(thisPileup); + if (stratifiedPerReadAlleleLikelihoodMap == null) + return null; + Double d = scoreIndelsAgainstHaplotypes(stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName())); if (d == null) return null; scoreRA.add(d); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense @@ -177,7 +185,6 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot private Haplotype getHaplotypeFromRead(final PileupElement p, final int contextSize, final int locus) { final GATKSAMRecord read = p.getRead(); - int readOffsetFromPileup = p.getOffset(); final byte[] haplotypeBases = new byte[contextSize]; Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD); @@ -189,7 +196,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot byte[] readQuals = read.getBaseQualities(); readQuals = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string - readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), p, read.getAlignmentStart(), locus); + final int readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), p, read.getAlignmentStart(), locus); final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2; for (int i = 0; i < contextSize; i++) { @@ -346,31 +353,26 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot } - private Double scoreIndelsAgainstHaplotypes(final ReadBackedPileup pileup) { + private Double scoreIndelsAgainstHaplotypes(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) { final ArrayList haplotypeScores = new ArrayList(); - final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - - if (indelLikelihoodMap == null) + if (perReadAlleleLikelihoodMap.isEmpty()) return null; - for (final PileupElement p : pileup) { - if (indelLikelihoodMap.containsKey(p)) { - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); + for (Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { - // Score all the reads in the pileup, even the filtered ones - final double[] scores = new double[el.size()]; - int i = 0; - for (Map.Entry a : el.entrySet()) { - scores[i++] = -a.getValue(); - if (DEBUG) { - System.out.printf(" vs. haplotype %d = %f%n", i - 1, scores[i - 1]); - } + // retrieve likelihood information corresponding to this read + // Score all the reads in the pileup, even the filtered ones + final double[] scores = new double[el.getValue().size()]; + int i = 0; + for (Map.Entry a : el.getValue().entrySet()) { + scores[i++] = -a.getValue(); + if (DEBUG) { + System.out.printf(" vs. haplotype %d = %f%n", i - 1, scores[i - 1]); } - - haplotypeScores.add(scores); } + + haplotypeScores.add(scores); } // indel likelihoods are strict log-probs, not phred scored diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java index 6ba85de07..06fa04526 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.WorkInProgressAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -29,7 +30,12 @@ public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgress private static final int MIN_GENOTYPE_QUALITY = 10; private static final int MIN_LOG10_PERROR = MIN_GENOTYPE_QUALITY / 10; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java index 9f20bf375..5891cbc69 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -22,7 +23,12 @@ public class HomopolymerRun extends InfoFieldAnnotation { private boolean ANNOTATE_INDELS = true; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( !vc.isBiallelic() ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java index 715895526..64be64afa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -33,17 +34,18 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno private static final int MIN_SAMPLES = 10; private Set founderIds; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { //If available, get the founder IDs and cache them. the IC will only be computed on founders then. - if(founderIds == null) + if(founderIds == null && walker != null) founderIds = ((Walker)walker).getSampleDB().getFounderIds(); return calculateIC(vc); } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - return calculateIC(vc); - } - private Map calculateIC(final VariantContext vc) { final GenotypesContext genotypes = (founderIds == null || founderIds.isEmpty()) ? vc.getGenotypes() : vc.getGenotypes(founderIds); if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java index babaf7ee6..5f405cb46 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.IndelUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -18,9 +19,14 @@ import java.util.*; */ public class IndelType extends InfoFieldAnnotation implements ExperimentalAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { - int run; + int run; if (vc.isMixed()) { Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%s", "MIXED")); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java index 7f5033adf..4be601bc8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -21,7 +22,12 @@ import java.util.Map; */ public class LowMQ extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index b6f24433e..3136a696d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -32,7 +33,12 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment private String fatherId; private String childId; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( mendelianViolation == null ) { if (checkAndSetSamples(((Walker) walker).getSampleDB())) { mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP ); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 31067e386..ef0c8ab4f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -2,11 +2,13 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -23,60 +25,39 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MQRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities")); } - protected void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals) { - for ( final PileupElement p : pileup ) { - if ( isUsableBase(p) ) { - if ( p.getBase() == ref ) { - refQuals.add((double)p.getMappingQual()); - } else if ( alts.contains(p.getBase()) ) { - altQuals.add((double)p.getMappingQual()); - } - } - } - } + protected void fillQualsFromPileup(final List allAlleles, + final int refLoc, + final ReadBackedPileup pileup, + final PerReadAlleleLikelihoodMap likelihoodMap, + final List refQuals, final List altQuals) { - protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { - for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { - final boolean matchesRef = ref.equals(alleleBin.getKey()); - final boolean matchesAlt = alts.contains(alleleBin.getKey()); - if ( !matchesRef && !matchesAlt ) - continue; - - for ( final GATKSAMRecord read : alleleBin.getValue() ) { - if ( matchesRef ) - refQuals.add((double)read.getMappingQuality()); - else - altQuals.add((double)read.getMappingQuality()); - } - } - } - - protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { - // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? - HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - for (final PileupElement p: pileup) { - if (indelLikelihoodMap.containsKey(p) && p.getMappingQual() != 0 && p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE) { - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); - // by design, first element in LinkedHashMap was ref allele - double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; - - for (Map.Entry a : el.entrySet()) { - - if (a.getKey().isReference()) - refLikelihood = a.getValue(); - else { - double like = a.getValue(); - if (like >= altLikelihood) - altLikelihood = like; + if (pileup != null && likelihoodMap == null) { + // no per-read likelihoods available: + for ( final PileupElement p : pileup ) { + if ( isUsableBase(p) ) { + if ( allAlleles.get(0).equals(Allele.create(p.getBase())) ) { + refQuals.add((double)p.getMappingQual()); + } else if ( allAlleles.contains(Allele.create(p.getBase()))) { + altQuals.add((double)p.getMappingQual()); } } - if (refLikelihood > altLikelihood + INDEL_LIKELIHOOD_THRESH) - refQuals.add((double)p.getMappingQual()); - else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) - altQuals.add((double)p.getMappingQual()); } + return; + } + for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { + if (!isUsableBase(el.getKey())) + continue; + + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (a.isNoCall()) + continue; // read is non-informative + if (a.isReference()) + refQuals.add((double)el.getKey().getMappingQual()); + else if (allAlleles.contains(a)) + altQuals.add((double)el.getKey().getMappingQual()); + + } } - -} \ No newline at end of file + + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java index 372d5bc9e..c3cb01c23 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; @@ -24,7 +25,12 @@ import java.util.Map; */ public class MappingQualityZero extends InfoFieldAnnotation implements StandardAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java index 9f542e3bd..21ee66ea2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -22,7 +23,12 @@ import java.util.Map; */ public class MappingQualityZeroFraction extends InfoFieldAnnotation implements ExperimentalAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java index ba4303b4a..8e4edaf0e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -20,7 +21,12 @@ import java.util.Map; * The number of N bases, counting only SOLiD data */ public class NBaseCount extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index b62cd374b..f94d51bc8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -28,14 +29,24 @@ import java.util.Map; */ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( !vc.hasLog10PError() || stratifiedContexts.size() == 0 ) + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { + if ( !vc.hasLog10PError() ) return null; final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() == 0 ) return null; + if (stratifiedContexts != null && stratifiedContexts.size() == 0) + return null; + if (perReadAlleleLikelihoodMap != null && perReadAlleleLikelihoodMap.size() == 0) + return null; + int depth = 0; for ( final Genotype genotype : genotypes ) { @@ -44,11 +55,20 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( !genotype.isHet() && !genotype.isHomVar() ) continue; - AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) - continue; + if (stratifiedContexts!= null) { + AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); + if ( context == null ) + continue; + depth += context.getBasePileup().depthOfCoverage(); - depth += context.getBasePileup().depthOfCoverage(); + } + else if (perReadAlleleLikelihoodMap != null) { + PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName()); + if (perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty()) + continue; + + depth += perReadAlleleLikelihoods.getLikelihoodReadMap().size(); + } } if ( depth == 0 ) @@ -67,39 +87,5 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; - - final GenotypesContext genotypes = vc.getGenotypes(); - if ( genotypes == null || genotypes.size() == 0 ) - return null; - - int depth = 0; - - for ( final Genotype genotype : genotypes ) { - - // we care only about variant calls with likelihoods - if ( !genotype.isHet() && !genotype.isHomVar() ) - continue; - - final Map> alleleBins = stratifiedContexts.get(genotype.getSampleName()); - if ( alleleBins == null ) - continue; - - for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { - depth += alleleBin.getValue().size(); - } - } - - if ( depth == 0 ) - return null; - - double QD = -10.0 * vc.getLog10PError() / (double)depth; - - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.2f", QD)); - return map; - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java index 842fde8ad..21b91b4b2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; @@ -18,10 +19,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** @@ -29,25 +27,48 @@ import java.util.Map; */ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { + int totalSize = 0, index = 0; + int qualities[]; + if (stratifiedContexts != null) { + if ( stratifiedContexts.size() == 0 ) + return null; - int totalSize = 0; - for ( AlignmentContext context : stratifiedContexts.values() ) - totalSize += context.size(); + for ( AlignmentContext context : stratifiedContexts.values() ) + totalSize += context.size(); - final int[] qualities = new int[totalSize]; - int index = 0; + qualities = new int[totalSize]; - for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - AlignmentContext context = sample.getValue(); - final ReadBackedPileup pileup = context.getBasePileup(); - for (PileupElement p : pileup ) { - if ( p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) - qualities[index++] = p.getMappingQual(); + for ( Map.Entry sample : stratifiedContexts.entrySet() ) { + AlignmentContext context = sample.getValue(); + for (PileupElement p : context.getBasePileup() ) + index = fillMappingQualitiesFromPileupAndUpdateIndex(p, index, qualities); } } + else if (perReadAlleleLikelihoodMap != null) { + if ( perReadAlleleLikelihoodMap.size() == 0 ) + return null; + + for ( PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) + totalSize += perReadLikelihoods.size(); + + qualities = new int[totalSize]; + for ( PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) { + for (PileupElement p : perReadLikelihoods.getStoredPileupElements()) + index = fillMappingQualitiesFromPileupAndUpdateIndex(p, index, qualities); + + + } + } + else + return null; + + double rms = MathUtils.rms(qualities); Map map = new HashMap(); @@ -55,32 +76,12 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn return map; } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; + private static int fillMappingQualitiesFromPileupAndUpdateIndex(final PileupElement p, final int inputIdx, final int[] qualities) { + int outputIdx = inputIdx; + if ( p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) + qualities[outputIdx++] = p.getMappingQual(); - int depth = 0; - for ( final Map> alleleBins : stratifiedContexts.values() ) { - for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { - depth += alleleBin.getValue().size(); - } - } - - final int[] qualities = new int[depth]; - int index = 0; - - for ( final Map> alleleBins : stratifiedContexts.values() ) { - for ( final List reads : alleleBins.values() ) { - for ( final GATKSAMRecord read : reads ) { - if ( read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) - qualities[index++] = read.getMappingQuality(); - } - } - } - - final Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.2f", MathUtils.rms(qualities))); - return map; + return outputIdx; } public List getKeyNames() { return Arrays.asList(VCFConstants.RMS_MAPPING_QUALITY_KEY); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index bf6adcfac..474625fff 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MannWhitneyU; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.Pair; @@ -28,12 +29,15 @@ import java.util.Map; * Abstract root for all RankSum based annotations */ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation { - static final double INDEL_LIKELIHOOD_THRESH = 0.1; static final boolean DEBUG = false; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if (stratifiedContexts.size() == 0) - return null; + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { + // either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null final GenotypesContext genotypes = vc.getGenotypes(); if (genotypes == null || genotypes.size() == 0) @@ -42,39 +46,24 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR final ArrayList refQuals = new ArrayList(); final ArrayList altQuals = new ArrayList(); - if ( vc.isSNP() ) { - final List altAlleles = new ArrayList(); - for ( final Allele a : vc.getAlternateAlleles() ) - altAlleles.add(a.getBases()[0]); - - for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + PerReadAlleleLikelihoodMap indelLikelihoodMap = null; + ReadBackedPileup pileup = null; + if (stratifiedPerReadAlleleLikelihoodMap != null && !stratifiedPerReadAlleleLikelihoodMap.isEmpty()) { + indelLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); + if (indelLikelihoodMap == null) + continue; + if (indelLikelihoodMap.isEmpty()) + continue; + } + else if (stratifiedContexts != null) { final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) continue; - - fillQualsFromPileup(ref.getBase(), altAlleles, context.getBasePileup(), refQuals, altQuals); + pileup = context.getBasePileup(); } - } else if ( vc.isIndel() || vc.isMixed() ) { - - for (final Genotype genotype : genotypes.iterateInSampleNameOrder()) { - final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - if (context == null) { - continue; - } - - final ReadBackedPileup pileup = context.getBasePileup(); - if (pileup == null) - continue; - - if (IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap() == null || - IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().size() == 0) - return null; - - fillIndelQualsFromPileup(pileup, refQuals, altQuals); - } - } else - return null; - + fillQualsFromPileup(vc.getAlleles(), vc.getStart(), pileup, indelLikelihoodMap, refQuals, altQuals ); + } final MannWhitneyU mannWhitneyU = new MannWhitneyU(); for (final Double qual : altQuals) { mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); @@ -103,50 +92,12 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR return map; } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if (stratifiedContexts.size() == 0) - return null; - - final GenotypesContext genotypes = vc.getGenotypes(); - if (genotypes == null || genotypes.size() == 0) - return null; - - final ArrayList refQuals = new ArrayList(); - final ArrayList altQuals = new ArrayList(); - - for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { - final Map> context = stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) - continue; - - fillQualsFromPileup(vc.getReference(), vc.getAlternateAlleles(), vc.getStart(), context, refQuals, altQuals); - } - - if ( refQuals.size() == 0 || altQuals.size() == 0 ) - return null; - - final MannWhitneyU mannWhitneyU = new MannWhitneyU(); - for (final Double qual : altQuals) { - mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); - } - for (final Double qual : refQuals) { - mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); - } - - // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases) - final Pair testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1); - - final Map map = new HashMap(); - if (!Double.isNaN(testResults.first)) - map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); - return map; - } - - protected abstract void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, List altQuals); - - protected abstract void fillQualsFromPileup(final byte ref, final List alts, final ReadBackedPileup pileup, final List refQuals, final List altQuals); - - protected abstract void fillIndelQualsFromPileup(final ReadBackedPileup pileup, final List refQuals, final List altQuals); + protected abstract void fillQualsFromPileup(final List alleles, + final int refLoc, + final ReadBackedPileup readBackedPileup, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap, + final List refQuals, + final List altQuals); /** * Can the base in this pileup element be used in comparative tests between ref / alt bases? diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index 3456041c7..a1b8bcfc8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -6,6 +6,7 @@ import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -32,98 +33,55 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio return Arrays.asList(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias")); } - protected void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals) { - for (final PileupElement p : pileup) { - if (isUsableBase(p)) { - int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0); - final int numAlignedBases = AlignmentUtils.getNumAlignedBases(p.getRead()); - if (readPos > numAlignedBases / 2) - readPos = numAlignedBases - (readPos + 1); + protected void fillQualsFromPileup(final List allAlleles, + final int refLoc, + final ReadBackedPileup pileup, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap, + final List refQuals, final List altQuals) { + if (alleleLikelihoodMap == null) { + // use fast SNP-based version if we don't have per-read allele likelihoods + for ( final PileupElement p : pileup ) { + if ( isUsableBase(p) ) { + int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0); - if ( p.getBase() == ref ) - refQuals.add((double) readPos); - else if ( alts.contains(p.getBase()) ) - altQuals.add((double) readPos); - } - } - } + readPos = getFinalReadPosition(p.getRead(),readPos); - protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { - for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { - final boolean matchesRef = ref.equals(alleleBin.getKey()); - final boolean matchesAlt = alts.contains(alleleBin.getKey()); - if ( !matchesRef && !matchesAlt ) - continue; - - for ( final GATKSAMRecord read : alleleBin.getValue() ) { - final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate( read.getSoftStart(), read.getCigar(), refLoc, ReadUtils.ClippingTail.RIGHT_TAIL, true ); - if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) - continue; - int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, false, 0, 0 ); - - final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read ); - if (readPos > numAlignedBases / 2) - readPos = numAlignedBases - (readPos + 1); - - if ( matchesRef ) - refQuals.add((double) readPos); - else - altQuals.add((double) readPos); - } - } - } - - protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { - // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele - // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. - // A pileup element then has a list of pairs of form (Allele, likelihood of this allele). - // To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles. - // If likelihood of ref allele > highest likelihood of all alt alleles + epsilon, then this pielup element is "ref" - // otherwise if highest alt allele likelihood is > ref likelihood + epsilon, then this pileup element it "alt" - final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - for (final PileupElement p : pileup) { - if (indelLikelihoodMap.containsKey(p)) { - LinkedHashMap el = indelLikelihoodMap.get(p); // retrieve likelihood information corresponding to this read - double refLikelihood = 0.0, altLikelihood = Double.NEGATIVE_INFINITY; // by design, first element in LinkedHashMap was ref allele - - for (Map.Entry a : el.entrySet()) { - if (a.getKey().isReference()) - refLikelihood = a.getValue(); - else { - double like = a.getValue(); - if (like >= altLikelihood) - altLikelihood = like; + if ( allAlleles.get(0).equals(Allele.create(p.getBase())) ) { + refQuals.add((double)readPos); + } else if ( allAlleles.contains(Allele.create(p.getBase()))) { + altQuals.add((double)readPos); } } - - int readPos = getOffsetFromClippedReadStart(p.getRead(), p.getOffset()); - final int numAlignedBases = getNumAlignedBases(p.getRead()); - - if (readPos > numAlignedBases / 2) { - readPos = numAlignedBases - (readPos + 1); - } - //if (DEBUG) System.out.format("R:%s start:%d C:%s offset:%d rp:%d readPos:%d alignedB:%d\n",p.getRead().getReadName(),p.getRead().getAlignmentStart(),p.getRead().getCigarString(),p.getOffset(), rp, readPos, numAlignedBases); - - - // if event is beyond span of read just return and don't consider this element. This can happen, for example, with reads - // where soft clipping still left strings of low quality bases but these are later removed by indel-specific clipping. - // if (readPos < -1) - // return; - if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) { - refQuals.add((double) readPos); - //if (DEBUG) System.out.format("REF like: %4.1f, pos: %d\n",refLikelihood,readPos); - } else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) { - altQuals.add((double) readPos); - //if (DEBUG) System.out.format("ALT like: %4.1f, pos: %d\n",refLikelihood,readPos); - - } - - } + return; + } + + for (Map.Entry> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + int readPos = getOffsetFromClippedReadStart(el.getKey().getRead(), el.getKey().getOffset()); + readPos = getFinalReadPosition(el.getKey().getRead(),readPos); + + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (a.isNoCall()) + continue; // read is non-informative + if (a.isReference()) + refQuals.add((double)readPos); + else if (allAlleles.contains(a)) + altQuals.add((double)readPos); + } } + int getFinalReadPosition(GATKSAMRecord read, int initialReadPosition) { + final int numAlignedBases = getNumAlignedBases(read); + + int readPos = initialReadPosition; + if (initialReadPosition > numAlignedBases / 2) { + readPos = numAlignedBases - (initialReadPosition + 1); + } + return readPos; + + } int getNumClippedBasesAtStart(SAMRecord read) { // compute total number of clipped bases (soft or hard clipped) // check for hard clips (never consider these bases): diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java index 7e4d44cf2..abe55de5a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -46,7 +47,12 @@ import java.util.Map; */ public class SampleList extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( vc.isMonomorphicInSamples() || !vc.hasGenotypes() ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index 4d990e738..f0bd7ecd9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -225,7 +226,12 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio headerLines.add(new VCFHeaderLine(OUTPUT_VCF_HEADER_COMMAND_LINE_KEY, snpEffCommandLine.getValue())); } - public Map annotate ( RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc ) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { RodBinding snpEffRodBinding = walker.getSnpEffRodBinding(); // Get only SnpEff records that start at this locus, not merely span it: diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java index af2df8e6a..f6bb4e747 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -22,7 +23,12 @@ import java.util.Map; */ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java index eced387b3..439402e2f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -47,7 +48,12 @@ public class TandemRepeatAnnotator extends InfoFieldAnnotation implements Standa private static final String STR_PRESENT = "STR"; private static final String REPEAT_UNIT_KEY = "RU"; private static final String REPEATS_PER_ALLELE_KEY = "RPA"; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( !vc.isIndel()) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java index 63694d809..43ef188a8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -28,7 +29,12 @@ public class TechnologyComposition extends InfoFieldAnnotation implements Experi private String n454 ="Num454"; private String nSolid = "NumSOLiD"; private String nOther = "NumOther"; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java index 2e3578dcb..c3e98c20f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -28,7 +29,12 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen private Set trios = null; private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( trios == null ) { if ( walker instanceof VariantAnnotator ) { trios = ((VariantAnnotator) walker).getSampleDB().getChildrenWithParents(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 073faf54e..f30fb4109 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -178,7 +179,18 @@ public class VariantAnnotatorEngine { this.requireStrictAlleleMatch = requireStrictAlleleMatch; } - public VariantContext annotateContext(final RefMetaDataTracker tracker, final ReferenceContext ref, final Map stratifiedContexts, VariantContext vc) { + public VariantContext annotateContext(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map stratifiedContexts, + VariantContext vc) { + return annotateContext(tracker, ref, stratifiedContexts, vc, null); + } + + public VariantContext annotateContext(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map stratifiedContexts, + VariantContext vc, + final Map perReadAlleleLikelihoodMap) { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // annotate db occurrences @@ -189,7 +201,7 @@ public class VariantAnnotatorEngine { // go through all the requested info annotationTypes for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { - Map annotationsFromCurrentType = annotationType.annotate(tracker, walker, ref, stratifiedContexts, vc); + Map annotationsFromCurrentType = annotationType.annotate(tracker, walker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap); if ( annotationsFromCurrentType != null ) infoAnnotations.putAll(annotationsFromCurrentType); } @@ -201,7 +213,7 @@ public class VariantAnnotatorEngine { return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc)).make(); } - public VariantContext annotateContext(final Map>> stratifiedContexts, VariantContext vc) { + public VariantContext annotateContext(final Map stratifiedContexts, VariantContext vc) { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // go through all the requested info annotationTypes diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java index de61c7741..7af4baddb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -10,8 +11,8 @@ import java.util.Map; // TODO -- make this an abstract class when we move away from InfoFieldAnnotation public interface ActiveRegionBasedAnnotation extends AnnotationType { - // return annotations for the given contexts split by sample and then allele - public abstract Map annotate(final Map>> stratifiedContexts, final VariantContext vc); + // return annotations for the given contexts split by sample and then read likelihoof + public abstract Map annotate(final Map stratifiedContexts, final VariantContext vc); // return the descriptions used for the VCF INFO meta field public abstract List getDescriptions(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java index 1569a605f..738be9883 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -11,8 +12,25 @@ import java.util.Map; public abstract class InfoFieldAnnotation extends VariantAnnotatorAnnotation { // return annotations for the given contexts split by sample - public abstract Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, - ReferenceContext ref, Map stratifiedContexts, VariantContext vc); + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc) { + return annotate(tracker, walker, ref, stratifiedContexts, vc, null); + } + + public Map annotate(Map perReadAlleleLikelihoodMap, VariantContext vc) { + return annotate(null, null, null, null, vc, perReadAlleleLikelihoodMap); + } + + + public abstract Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap); // return the descriptions used for the VCF INFO meta field public abstract List getDescriptions(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 6fdc926d5..77da35c41 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -103,7 +103,8 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { final AlignmentContextUtils.ReadOrientation contextType, final List allAllelesToUse, final boolean useBAQedPileup, - final GenomeLocParser locParser); + final GenomeLocParser locParser, + final Map perReadAlleleLikelihoodMap); protected int getFilteredDepth(ReadBackedPileup pileup) { @@ -115,4 +116,5 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { return count; } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index bedffa690..ebfbc49fe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -48,24 +48,11 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private boolean ignoreSNPAllelesWhenGenotypingIndels = false; private PairHMMIndelErrorModel pairModel; - private static ThreadLocal>> indelLikelihoodMap = - new ThreadLocal>>() { - protected synchronized HashMap> initialValue() { - return new HashMap>(); - } - }; private LinkedHashMap haplotypeMap; - // gdebug removeme - // todo -cleanup - private GenomeLoc lastSiteVisited; private List alleleList = new ArrayList(); - static { - indelLikelihoodMap.set(new HashMap>()); - } - protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); @@ -93,16 +80,15 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final AlignmentContextUtils.ReadOrientation contextType, final List allAllelesToUse, final boolean useBAQedPileup, - final GenomeLocParser locParser) { + final GenomeLocParser locParser, + final Map perReadAlleleLikelihoodMap) { GenomeLoc loc = ref.getLocus(); // if (!ref.getLocus().equals(lastSiteVisited)) { if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) { // starting a new site: clear allele list - lastSiteVisited = ref.getLocus(); - indelLikelihoodMap.set(new HashMap>()); haplotypeMap.clear(); - + perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC, ignoreSNPAllelesWhenGenotypingIndels); if (alleleList.isEmpty()) return null; @@ -130,10 +116,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood for (Map.Entry sample : contexts.entrySet()) { AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + if (!perReadAlleleLikelihoodMap.containsKey(sample.getKey())){ + // no likelihoods have been computed for this sample at this site + perReadAlleleLikelihoodMap.put(sample.getKey(), new PerReadAlleleLikelihoodMap()); + } final ReadBackedPileup pileup = context.getBasePileup(); if (pileup != null) { final GenotypeBuilder b = new GenotypeBuilder(sample.getKey()); - final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); + final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey())); b.PL(genotypeLikelihoods); b.DP(getFilteredDepth(pileup)); genotypes.add(b.make()); @@ -150,10 +140,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood return builder.genotypes(genotypes).make(); } - public static HashMap> getIndelLikelihoodMap() { - return indelLikelihoodMap.get(); - } - public static void getHaplotypeMapFromAlleles(final List alleleList, final ReferenceContext ref, final GenomeLoc loc, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMap.java new file mode 100644 index 000000000..a704afba9 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMap.java @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers.genotyper; + + +//import org.broadinstitute.sting.gatk.walkers.Requires; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.*; + +public class PerReadAlleleLikelihoodMap { + public static final double INDEL_LIKELIHOOD_THRESH = 0.1; + + private List alleles; + private Map> likelihoodReadMap; + public PerReadAlleleLikelihoodMap() { + likelihoodReadMap = new LinkedHashMap>(); + alleles = new ArrayList(); + } + + public void add(PileupElement p, Allele a, Double likelihood) { + Map likelihoodMap; + if (likelihoodReadMap.containsKey(p)){ + // seen pileup element before + likelihoodMap = likelihoodReadMap.get(p); + } + else { + likelihoodMap = new HashMap(); + likelihoodReadMap.put(p,likelihoodMap); + } + likelihoodMap.put(a,likelihood); + + if (!alleles.contains(a)) + alleles.add(a); + + } + + public int size() { + return likelihoodReadMap.size(); + } + + public void add(GATKSAMRecord read, Allele a, Double likelihood) { + PileupElement p = new PileupElement(read,-1,false,false,false,false,false,false); + add(p,a,likelihood); + } + + public boolean containsPileupElement(PileupElement p) { + return likelihoodReadMap.containsKey(p); + } + + public boolean isEmpty() { + return likelihoodReadMap.isEmpty(); + } + + public Map> getLikelihoodReadMap() { + return likelihoodReadMap; + } + public void clear() { + alleles.clear(); + likelihoodReadMap.clear(); + } + + public Set getStoredPileupElements() { + return likelihoodReadMap.keySet(); + } + /** + * Returns list of reads greedily associated with a particular allele. + * Needs to loop for each read, and assign to each allele + * @param a Desired allele + * @return + */ + @Requires("a!=null") + public List getReadsAssociatedWithAllele(Allele a) { + return null; + } + + public Map getLikelihoodsAssociatedWithPileupElement(PileupElement p) { + if (!likelihoodReadMap.containsKey(p)) + return null; + + return likelihoodReadMap.get(p); + } + + public static Allele getMostLikelyAllele(Map alleleMap) { + double minLike = Double.POSITIVE_INFINITY, maxLike = Double.NEGATIVE_INFINITY; + Allele mostLikelyAllele = Allele.NO_CALL; + + for (Map.Entry el : alleleMap.entrySet()) { + if (el.getValue() > maxLike) { + maxLike = el.getValue(); + mostLikelyAllele = el.getKey(); + } + + if (el.getValue() < minLike) + minLike = el.getValue(); + + } + if (maxLike-minLike > INDEL_LIKELIHOOD_THRESH) + return mostLikelyAllele; + else + return Allele.NO_CALL; + } + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 07d5d2f2d..76ba72017 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -62,7 +62,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final AlignmentContextUtils.ReadOrientation contextType, final List allAllelesToUse, final boolean useBAQedPileup, - final GenomeLocParser locParser) { + final GenomeLocParser locParser, + final Map perReadAlleleLikelihoodMap) { + + perReadAlleleLikelihoodMap.clear(); // not used in SNP model, sanity check to delete any older data final byte refBase = ref.getBase(); final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index f15fa9b99..0b218865c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -177,19 +177,23 @@ public class UnifiedGenotyperEngine { final List results = new ArrayList(2); final List models = getGLModelsToUse(tracker, refContext, rawContext); + + final Map perReadAlleleLikelihoodMap = new HashMap(); + if ( models.isEmpty() ) { results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, null, rawContext) : null); } else { for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) { + perReadAlleleLikelihoodMap.clear(); final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); if ( stratifiedContexts == null ) { results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext) : null); } else { - final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model); + final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); if ( vc != null ) - results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true)); + results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); } } } @@ -219,9 +223,13 @@ public class UnifiedGenotyperEngine { * @param tracker the meta data tracker * @param refContext the reference base * @param rawContext contextual information around the locus + * @param perReadAlleleLikelihoodMap Map to store per-sample, per-read, per-allele likelihoods (only used for indels) * @return the VariantContext object */ - public VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { + public VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Map perReadAlleleLikelihoodMap) { final List models = getGLModelsToUse(tracker, refContext, rawContext); if ( models.isEmpty() ) { return null; @@ -231,7 +239,7 @@ public class UnifiedGenotyperEngine { final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); // return the first valid one we encounter if ( stratifiedContexts != null ) - return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model); + return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); } @@ -247,7 +255,11 @@ public class UnifiedGenotyperEngine { * @param vc the GL-annotated variant context * @return the VariantCallContext object */ - public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) { + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap) { final List models = getGLModelsToUse(tracker, refContext, rawContext); if ( models.isEmpty() ) { return null; @@ -256,25 +268,37 @@ public class UnifiedGenotyperEngine { // return the first one final GenotypeLikelihoodsCalculationModel.Model model = models.get(0); final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model); + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, perReadAlleleLikelihoodMap); } - - // --------------------------------------------------------------------------------------------------------- + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final VariantContext vc) { + return calculateGenotypes(tracker, refContext, rawContext, vc, null); + } + // --------------------------------------------------------------------------------------------------------- // // Private implementation helpers // // --------------------------------------------------------------------------------------------------------- // private method called by both UnifiedGenotyper and UGCalcLikelihoods entry points into the engine - private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map stratifiedContexts, AlignmentContextUtils.ReadOrientation type, List alternateAllelesToUse, boolean useBAQedPileup, final GenotypeLikelihoodsCalculationModel.Model model) { + private VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final Map stratifiedContexts, + final AlignmentContextUtils.ReadOrientation type, + final List alternateAllelesToUse, + final boolean useBAQedPileup, + final GenotypeLikelihoodsCalculationModel.Model model, + final Map perReadAlleleLikelihoodMap) { // initialize the data for this thread if that hasn't been done yet if ( glcm.get() == null ) { glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); } - return glcm.get().get(model.name().toUpperCase()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser); + return glcm.get().get(model.name().toUpperCase()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap); } private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { @@ -305,12 +329,22 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vc, false); } - public VariantCallContext calculateGenotypes(VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { - return calculateGenotypes(null, null, null, null, vc, model); + public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap) { + return calculateGenotypes(null, null, null, null, vc, model, perReadAlleleLikelihoodMap); } - public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false); + public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { + return calculateGenotypes(null, null, null, null, vc, model, null); + } + + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Map stratifiedContexts, + final VariantContext vc, + final GenotypeLikelihoodsCalculationModel.Model model, + final Map perReadAlleleLikelihoodMap) { + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false,perReadAlleleLikelihoodMap); } /** @@ -324,8 +358,11 @@ public class UnifiedGenotyperEngine { * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc * @return VC with assigned genotypes */ - public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, - final boolean inheritAttributesFromInputVC) { + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext rawContext, Map stratifiedContexts, + final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, + final boolean inheritAttributesFromInputVC, + final Map perReadAlleleLikelihoodMap) { boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; @@ -451,7 +488,7 @@ public class UnifiedGenotyperEngine { List allAllelesToUse = builder.make().getAlleles(); // the forward lod - VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model); + VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); AFresult.reset(); afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); @@ -460,7 +497,7 @@ public class UnifiedGenotyperEngine { //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod - VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model); + VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); AFresult.reset(); afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); @@ -496,7 +533,7 @@ public class UnifiedGenotyperEngine { final ReadBackedPileup pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); - vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall); + vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap); } return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 65c5a2fbc..9234a9fe8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; import com.google.java.contract.Ensures; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.PairHMM; @@ -40,6 +41,7 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashMap; +import java.util.Map; public class PairHMMIndelErrorModel { @@ -167,11 +169,15 @@ public class PairHMMIndelErrorModel { } - public synchronized double[] computeDiploidReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap haplotypeMap, ReferenceContext ref, int eventLength, HashMap> indelLikelihoodMap){ + public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup, + final LinkedHashMap haplotypeMap, + final ReferenceContext ref, + final int eventLength, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){ final int numHaplotypes = haplotypeMap.size(); final int readCounts[] = new int[pileup.getNumberOfElements()]; - final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, indelLikelihoodMap, readCounts); + final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts); return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); } @@ -181,7 +187,7 @@ public class PairHMMIndelErrorModel { final LinkedHashMap haplotypeMap, final ReferenceContext ref, final int eventLength, - final HashMap> indelLikelihoodMap, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final int[] readCounts) { final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; final PairHMM pairHMM = new PairHMM(bandedLikelihoods); @@ -192,8 +198,8 @@ public class PairHMMIndelErrorModel { readCounts[readIdx] = p.getRepresentativeCount(); // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location) - if (indelLikelihoodMap.containsKey(p)) { - HashMap el = indelLikelihoodMap.get(p); + if (perReadAlleleLikelihoodMap.containsPileupElement(p)) { + Map el = perReadAlleleLikelihoodMap.getLikelihoodsAssociatedWithPileupElement(p); int j=0; for (Allele a: haplotypeMap.keySet()) { readLikelihoods[readIdx][j++] = el.get(a); @@ -201,7 +207,7 @@ public class PairHMMIndelErrorModel { } else { final int refWindowStart = ref.getWindow().getStart(); - final int refWindowStop = ref.getWindow().getStop(); + final int refWindowStop = ref.getWindow().getStop(); if (DEBUG) { System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString()); @@ -280,7 +286,7 @@ public class PairHMMIndelErrorModel { System.out.format("numStartSoftClippedBases: %d numEndSoftClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", numStartSoftClippedBases, numEndSoftClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength()); - LinkedHashMap readEl = new LinkedHashMap(); + // LinkedHashMap readEl = new LinkedHashMap(); /** * Check if we'll end up with an empty read once all clipping is done @@ -288,7 +294,7 @@ public class PairHMMIndelErrorModel { if (numStartSoftClippedBases + numEndSoftClippedBases >= unclippedReadBases.length) { int j=0; for (Allele a: haplotypeMap.keySet()) { - readEl.put(a,0.0); + perReadAlleleLikelihoodMap.add(p,a,0.0); readLikelihoods[readIdx][j++] = 0.0; } } @@ -329,45 +335,45 @@ public class PairHMMIndelErrorModel { - final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), - (int)indStart, (int)indStop); + final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), + (int)indStart, (int)indStop); - final int X_METRIC_LENGTH = readBases.length+2; - final int Y_METRIC_LENGTH = haplotypeBases.length+2; + final int X_METRIC_LENGTH = readBases.length+2; + final int Y_METRIC_LENGTH = haplotypeBases.length+2; - if (matchMetricArray == null) { - //no need to reallocate arrays for each new haplotype, as length won't change - matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + if (matchMetricArray == null) { + //no need to reallocate arrays for each new haplotype, as length won't change + matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); - } + PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); + } - int startIndexInHaplotype = 0; - if (previousHaplotypeSeen != null) - startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); - previousHaplotypeSeen = haplotypeBases.clone(); + int startIndexInHaplotype = 0; + if (previousHaplotypeSeen != null) + startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); + previousHaplotypeSeen = haplotypeBases.clone(); - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, - (read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities), - (read.hasBaseIndelQualities() ? read.getBaseDeletionQualities() : contextLogGapOpenProbabilities), - contextLogGapContinuationProbabilities, - startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); + readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, + (read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities), + (read.hasBaseIndelQualities() ? read.getBaseDeletionQualities() : contextLogGapOpenProbabilities), + contextLogGapContinuationProbabilities, + startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); - if (DEBUG) { - System.out.println("H:"+new String(haplotypeBases)); - System.out.println("R:"+new String(readBases)); - System.out.format("L:%4.2f\n",readLikelihood); - System.out.format("StPos:%d\n", startIndexInHaplotype); - } - readEl.put(a,readLikelihood); + if (DEBUG) { + System.out.println("H:"+new String(haplotypeBases)); + System.out.println("R:"+new String(readBases)); + System.out.format("L:%4.2f\n",readLikelihood); + System.out.format("StPos:%d\n", startIndexInHaplotype); + } + + perReadAlleleLikelihoodMap.add(p, a, readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; } } - indelLikelihoodMap.put(p,readEl); } readIdx++; } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 96704f0b8..8fc5105e5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -625,6 +625,10 @@ public class MathUtils { return maxElementIndex(array, array.length); } + public static int maxElementIndex(final byte[] array) { + return maxElementIndex(array, array.length); + } + public static int maxElementIndex(final int[] array, int endIndex) { if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); @@ -638,6 +642,24 @@ public class MathUtils { return maxI; } + public static int maxElementIndex(final byte[] array, int endIndex) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int maxI = 0; + for (int i = 1; i < endIndex; i++) { + if (array[i] > array[maxI]) + maxI = i; + } + + return maxI; + } + + public static byte arrayMax(final byte[] array) { + return array[maxElementIndex(array)]; + } + + public static double arrayMax(final double[] array) { return array[maxElementIndex(array)]; } From 963ad03f8be0ee0d7c054fe1d242b78864103cb3 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Sun, 19 Aug 2012 21:18:18 -0400 Subject: [PATCH 02/32] Second step of interface cleanup for variant annotator: several bug fixes, don't hash pileup elements to Maps because the hashCode() for a pileup element is not implemented and strange things can happen. Still several things to do, not done yet --- .../gatk/walkers/genotyper/ErrorModel.java | 8 ++--- ...NPGenotypeLikelihoodsCalculationModel.java | 3 +- .../annotator/BaseQualityRankSumTest.java | 13 +++---- .../annotator/ClippingRankSumTest.java | 8 ++--- .../walkers/annotator/DepthOfCoverage.java | 2 +- .../gatk/walkers/annotator/FisherStrand.java | 19 +++++----- .../walkers/annotator/HaplotypeScore.java | 6 ++-- .../annotator/MappingQualityRankSumTest.java | 11 +++--- .../gatk/walkers/annotator/QualByDepth.java | 7 +--- .../walkers/annotator/RMSMappingQuality.java | 12 +++---- .../gatk/walkers/annotator/RankSumTest.java | 27 +++++++------- .../walkers/annotator/ReadPosRankSumTest.java | 17 ++++++--- .../genotyper/PerReadAlleleLikelihoodMap.java | 35 +++++++++++-------- 13 files changed, 90 insertions(+), 78 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java index 26ff4db24..311d66d81 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java @@ -53,13 +53,14 @@ public class ErrorModel { PairHMMIndelErrorModel pairModel = null; LinkedHashMap haplotypeMap = null; - HashMap> indelLikelihoodMap = null; double[][] perReadLikelihoods = null; double[] model = new double[maxQualityScore+1]; Arrays.fill(model,Double.NEGATIVE_INFINITY); boolean hasCalledAlleles = false; + + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap(); if (refSampleVC != null) { for (Allele allele : refSampleVC.getAlleles()) { @@ -72,7 +73,6 @@ public class ErrorModel { if (refSampleVC.isIndel()) { pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); - indelLikelihoodMap = new HashMap>(); IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements } } @@ -92,12 +92,12 @@ public class ErrorModel { Allele refAllele = refSampleVC.getReference(); - if (refSampleVC.isIndel()) { + if ( refSampleVC.isIndel()) { final int readCounts[] = new int[refSamplePileup.getNumberOfElements()]; //perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()]; final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles()); if (!haplotypeMap.isEmpty()) - perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, indelLikelihoodMap, readCounts); + perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, perReadAlleleLikelihoodMap, readCounts); } int idx = 0; for (PileupElement refPileupElement : refSamplePileup) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java index 30d614455..4376ec601 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java @@ -49,7 +49,8 @@ public class GeneralPloidySNPGenotypeLikelihoodsCalculationModel extends General final HashMap perLaneErrorModels, final boolean useBQAedPileup, final ReferenceContext ref, - final boolean ignoreLaneInformation) { + final boolean ignoreLaneInformation, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){ return new GeneralPloidySNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index 3f1eaa139..dc727fa48 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -32,7 +32,7 @@ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnot // use fast SNP-based version if we don't have per-read allele likelihoods for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { - if ( allAlleles.get(0).equals(Allele.create(p.getBase())) ) { + if ( allAlleles.get(0).equals(Allele.create(p.getBase(),true)) ) { refQuals.add((double)p.getQual()); } else if ( allAlleles.contains(Allele.create(p.getBase()))) { altQuals.add((double)p.getQual()); @@ -42,17 +42,14 @@ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnot return; } - for (Map.Entry> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { - if (!isUsableBase(el.getKey())) - continue; - - final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + for (Map el : alleleLikelihoodMap.getLikelihoodMapValues()) { + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el); if (a.isNoCall()) continue; // read is non-informative if (a.isReference()) - refQuals.add(-10.0*(double)el.getValue().get(a)); + refQuals.add(-10.0*(double)el.get(a)); else if (allAlleles.contains(a)) - altQuals.add(-10.0*(double)el.getValue().get(a)); + altQuals.add(-10.0*(double)el.get(a)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java index fdbbf6732..449e047cd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java @@ -31,18 +31,18 @@ public class ClippingRankSumTest extends RankSumTest { final ReadBackedPileup pileup, final PerReadAlleleLikelihoodMap likelihoodMap, final List refQuals, final List altQuals) { // todo - only support non-pileup case for now, e.g. active-region based version - if (pileup != null) + if (pileup != null || likelihoodMap == null) return; - for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { + for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); if (a.isNoCall()) continue; // read is non-informative if (a.isReference()) - refQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey().getRead())); + refQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey())); else if (allAlleles.contains(a)) - altQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey().getRead())); + altQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey())); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index 8f67414fa..5865de2c1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -59,7 +59,7 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno return null; for ( Map.Entry sample : perReadAlleleLikelihoodMap.entrySet() ) - depth += sample.getValue().getLikelihoodReadMap().size(); + depth += sample.getValue().getNumberOfStoredElements(); } else return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 610d5e7b0..ad0ad50b0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -66,12 +66,13 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat int[][] table; - if (stratifiedPerReadAlleleLikelihoodMap != null) { - table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); - } - else if (vc.isSNP() && stratifiedContexts != null) { + if (vc.isSNP() && stratifiedContexts != null) { table = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); } + else if (stratifiedPerReadAlleleLikelihoodMap != null) { + // either SNP with no alignment context, or indels: per-read likelihood map needed + table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); + } else // for non-snp variants, we need per-read likelihoods. // for snps, we can get same result from simple pileup @@ -216,14 +217,16 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat int[][] table = new int[2][2]; for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { - for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { - final boolean matchesRef = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(ref); - final boolean matchesAlt = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(alt); + for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { + if ( el.getKey().isReducedRead() ) // ignore reduced reads + continue; + final boolean matchesRef = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(ref,true); + final boolean matchesAlt = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(alt,true); if ( !matchesRef && !matchesAlt ) continue; - boolean isFW = el.getKey().getRead().getReadNegativeStrandFlag(); + boolean isFW = el.getKey().getReadNegativeStrandFlag(); int row = matchesRef ? 0 : 1; int column = isFW ? 0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index e01c51f4b..b784bfe08 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -359,13 +359,13 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot if (perReadAlleleLikelihoodMap.isEmpty()) return null; - for (Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + for (Map el : perReadAlleleLikelihoodMap.getLikelihoodMapValues()) { // retrieve likelihood information corresponding to this read // Score all the reads in the pileup, even the filtered ones - final double[] scores = new double[el.getValue().size()]; + final double[] scores = new double[el.size()]; int i = 0; - for (Map.Entry a : el.getValue().entrySet()) { + for (Map.Entry a : el.entrySet()) { scores[i++] = -a.getValue(); if (DEBUG) { System.out.printf(" vs. haplotype %d = %f%n", i - 1, scores[i - 1]); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index ef0c8ab4f..6557f3e47 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -35,7 +35,7 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn // no per-read likelihoods available: for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { - if ( allAlleles.get(0).equals(Allele.create(p.getBase())) ) { + if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) { refQuals.add((double)p.getMappingQual()); } else if ( allAlleles.contains(Allele.create(p.getBase()))) { altQuals.add((double)p.getMappingQual()); @@ -44,17 +44,14 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn } return; } - for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { - if (!isUsableBase(el.getKey())) - continue; - + for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); if (a.isNoCall()) continue; // read is non-informative if (a.isReference()) - refQuals.add((double)el.getKey().getMappingQual()); + refQuals.add((double)el.getKey().getMappingQuality()); else if (allAlleles.contains(a)) - altQuals.add((double)el.getKey().getMappingQual()); + altQuals.add((double)el.getKey().getMappingQuality()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index f94d51bc8..a48d4a678 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -42,11 +42,6 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( genotypes == null || genotypes.size() == 0 ) return null; - if (stratifiedContexts != null && stratifiedContexts.size() == 0) - return null; - if (perReadAlleleLikelihoodMap != null && perReadAlleleLikelihoodMap.size() == 0) - return null; - int depth = 0; for ( final Genotype genotype : genotypes ) { @@ -67,7 +62,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if (perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty()) continue; - depth += perReadAlleleLikelihoods.getLikelihoodReadMap().size(); + depth += perReadAlleleLikelihoods.getNumberOfStoredElements(); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java index 21b91b4b2..680478da0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -47,7 +47,7 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn for ( Map.Entry sample : stratifiedContexts.entrySet() ) { AlignmentContext context = sample.getValue(); for (PileupElement p : context.getBasePileup() ) - index = fillMappingQualitiesFromPileupAndUpdateIndex(p, index, qualities); + index = fillMappingQualitiesFromPileupAndUpdateIndex(p.getRead(), index, qualities); } } else if (perReadAlleleLikelihoodMap != null) { @@ -59,8 +59,8 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn qualities = new int[totalSize]; for ( PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) { - for (PileupElement p : perReadLikelihoods.getStoredPileupElements()) - index = fillMappingQualitiesFromPileupAndUpdateIndex(p, index, qualities); + for (GATKSAMRecord read : perReadLikelihoods.getStoredElements()) + index = fillMappingQualitiesFromPileupAndUpdateIndex(read, index, qualities); } @@ -76,10 +76,10 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn return map; } - private static int fillMappingQualitiesFromPileupAndUpdateIndex(final PileupElement p, final int inputIdx, final int[] qualities) { + private static int fillMappingQualitiesFromPileupAndUpdateIndex(final GATKSAMRecord read, final int inputIdx, final int[] qualities) { int outputIdx = inputIdx; - if ( p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) - qualities[outputIdx++] = p.getMappingQual(); + if ( read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) + qualities[outputIdx++] = read.getMappingQuality(); return outputIdx; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 474625fff..fb9f8603e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -49,19 +49,22 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { PerReadAlleleLikelihoodMap indelLikelihoodMap = null; ReadBackedPileup pileup = null; - if (stratifiedPerReadAlleleLikelihoodMap != null && !stratifiedPerReadAlleleLikelihoodMap.isEmpty()) { - indelLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); - if (indelLikelihoodMap == null) - continue; - if (indelLikelihoodMap.isEmpty()) - continue; - } - else if (stratifiedContexts != null) { + + + if (stratifiedContexts != null) { final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) - continue; - pileup = context.getBasePileup(); + if ( context != null ) + pileup = context.getBasePileup(); } + if (stratifiedPerReadAlleleLikelihoodMap != null ) + indelLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); + + if (indelLikelihoodMap != null && indelLikelihoodMap.isEmpty()) + indelLikelihoodMap = null; + // treat an empty likelihood map as a null reference - will simplify contract with fillQualsFromPileup + if (indelLikelihoodMap == null && pileup == null) + continue; + fillQualsFromPileup(vc.getAlleles(), vc.getStart(), pileup, indelLikelihoodMap, refQuals, altQuals ); } final MannWhitneyU mannWhitneyU = new MannWhitneyU(); @@ -92,7 +95,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR return map; } - protected abstract void fillQualsFromPileup(final List alleles, + protected abstract void fillQualsFromPileup(final List alleles, final int refLoc, final ReadBackedPileup readBackedPileup, final PerReadAlleleLikelihoodMap alleleLikelihoodMap, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index a1b8bcfc8..95fadfd46 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -47,7 +47,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio readPos = getFinalReadPosition(p.getRead(),readPos); - if ( allAlleles.get(0).equals(Allele.create(p.getBase())) ) { + if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) { refQuals.add((double)readPos); } else if ( allAlleles.contains(Allele.create(p.getBase()))) { altQuals.add((double)readPos); @@ -57,9 +57,18 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio return; } - for (Map.Entry> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { - int readPos = getOffsetFromClippedReadStart(el.getKey().getRead(), el.getKey().getOffset()); - readPos = getFinalReadPosition(el.getKey().getRead(),readPos); + for (Map.Entry> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + final GATKSAMRecord read = el.getKey(); + final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate( read.getSoftStart(), read.getCigar(), refLoc, ReadUtils.ClippingTail.RIGHT_TAIL, true ); + if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) + continue; + int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, false, 0, 0 ); + final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read ); + if (readPos > numAlignedBases / 2) + readPos = numAlignedBases - (readPos + 1); + +// int readPos = getOffsetFromClippedReadStart(el.getKey(), el.getKey().getOffset()); + // readPos = getFinalReadPosition(el.getKey().getRead(),readPos); final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); if (a.isNoCall()) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMap.java index a704afba9..9c0062876 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PerReadAlleleLikelihoodMap.java @@ -37,21 +37,21 @@ public class PerReadAlleleLikelihoodMap { public static final double INDEL_LIKELIHOOD_THRESH = 0.1; private List alleles; - private Map> likelihoodReadMap; + private Map> likelihoodReadMap; public PerReadAlleleLikelihoodMap() { - likelihoodReadMap = new LinkedHashMap>(); + likelihoodReadMap = new LinkedHashMap>(); alleles = new ArrayList(); } - public void add(PileupElement p, Allele a, Double likelihood) { + public void add(GATKSAMRecord read, Allele a, Double likelihood) { Map likelihoodMap; - if (likelihoodReadMap.containsKey(p)){ + if (likelihoodReadMap.containsKey(read)){ // seen pileup element before - likelihoodMap = likelihoodReadMap.get(p); + likelihoodMap = likelihoodReadMap.get(read); } else { likelihoodMap = new HashMap(); - likelihoodReadMap.put(p,likelihoodMap); + likelihoodReadMap.put(read,likelihoodMap); } likelihoodMap.put(a,likelihood); @@ -64,20 +64,19 @@ public class PerReadAlleleLikelihoodMap { return likelihoodReadMap.size(); } - public void add(GATKSAMRecord read, Allele a, Double likelihood) { - PileupElement p = new PileupElement(read,-1,false,false,false,false,false,false); - add(p,a,likelihood); + public void add(PileupElement p, Allele a, Double likelihood) { + add(p.getRead(),a,likelihood); } public boolean containsPileupElement(PileupElement p) { - return likelihoodReadMap.containsKey(p); + return likelihoodReadMap.containsKey(p.getRead()); } public boolean isEmpty() { return likelihoodReadMap.isEmpty(); } - public Map> getLikelihoodReadMap() { + public Map> getLikelihoodReadMap() { return likelihoodReadMap; } public void clear() { @@ -85,9 +84,17 @@ public class PerReadAlleleLikelihoodMap { likelihoodReadMap.clear(); } - public Set getStoredPileupElements() { + public Set getStoredElements() { return likelihoodReadMap.keySet(); } + + public Collection> getLikelihoodMapValues() { + return likelihoodReadMap.values(); + } + + public int getNumberOfStoredElements() { + return likelihoodReadMap.size(); + } /** * Returns list of reads greedily associated with a particular allele. * Needs to loop for each read, and assign to each allele @@ -100,10 +107,10 @@ public class PerReadAlleleLikelihoodMap { } public Map getLikelihoodsAssociatedWithPileupElement(PileupElement p) { - if (!likelihoodReadMap.containsKey(p)) + if (!likelihoodReadMap.containsKey(p.getRead())) return null; - return likelihoodReadMap.get(p); + return likelihoodReadMap.get(p.getRead()); } public static Allele getMostLikelyAllele(Map alleleMap) { From 154f65e0dec1e75fc318b501014e65b2f833a73c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 20 Aug 2012 12:43:17 -0400 Subject: [PATCH 03/32] Temporarily disabling multi-threaded usage of BaseRecalibrator for performance reasons. --- .../sting/gatk/walkers/bqsr/BQSRIntegrationTest.java | 9 +++++---- .../sting/gatk/walkers/bqsr/BaseRecalibrator.java | 4 ++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index bd75806dd..580667ee2 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -75,10 +75,11 @@ public class BQSRIntegrationTest extends WalkerTest { Arrays.asList(params.md5)); executeTest("testBQSR-"+params.args, spec).getFirst(); - WalkerTestSpec specNT2 = new WalkerTestSpec( - params.getCommandLine() + " -nt 2", - Arrays.asList(params.md5)); - executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst(); + // TODO -- re-enable once parallelization is fixed in BaseRecalibrator + //WalkerTestSpec specNT2 = new WalkerTestSpec( + // params.getCommandLine() + " -nt 2", + // Arrays.asList(params.md5)); + //executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst(); } @Test diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index e45cad971..91d982f20 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -136,6 +136,10 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed */ public void initialize() { + // TODO -- remove me after the 2.1 release + if ( getToolkit().getArguments().numberOfThreads > 1 ) + throw new UserException("We have temporarily disabled the ability to run BaseRecalibrator multi-threaded for performance reasons. We hope to have this fixed for the next GATK release (2.2) and apologize for the inconvenience."); + // check for unsupported access if (getToolkit().isGATKLite() && !getToolkit().getArguments().disableIndelQuals) throw new UserException.NotSupportedInGATKLite("base insertion/deletion recalibration is not supported, please use the --disable_indel_quals argument"); From 5b5fee56cfc936e9dcad3bd31dc50d28eb31883f Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Mon, 20 Aug 2012 12:52:15 -0400 Subject: [PATCH 04/32] Next iteration of new VA interface: extend changes to per-genotype annotations as well. Will allow to have AD correctly implemented at last (that change not done yet) --- .../walkers/annotator/AlleleBalanceBySample.java | 10 +++++++++- .../walkers/annotator/DepthPerAlleleBySample.java | 4 +++- .../annotator/MappingQualityZeroBySample.java | 14 ++++++++++---- .../walkers/annotator/VariantAnnotatorEngine.java | 15 ++++++++++----- .../annotator/interfaces/GenotypeAnnotation.java | 12 +++++++++--- 5 files changed, 41 insertions(+), 14 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java index 11c9c3a99..0104f24d9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -24,7 +25,14 @@ import java.util.List; */ public class AlleleBalanceBySample extends GenotypeAnnotation implements ExperimentalAnnotation { - public void annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g, final GenotypeBuilder gb) { + public void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ Double ratio = annotateSNP(stratifiedContext, vc, g); if (ratio == null) return; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index cd8faf093..8922bf54a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; @@ -48,7 +49,8 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa final AlignmentContext stratifiedContext, final VariantContext vc, final Genotype g, - final GenotypeBuilder gb) { + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { if ( g == null || !g.isCalled() ) return; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java index b5252f15b..354b798bb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -46,14 +47,19 @@ import java.util.List; * Count for each sample of mapping quality zero reads */ public class MappingQualityZeroBySample extends GenotypeAnnotation { - public void annotate(RefMetaDataTracker tracker, - AnnotatorCompatible walker, ReferenceContext ref, AlignmentContext context, - VariantContext vc, Genotype g, GenotypeBuilder gb) { + public void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ if ( g == null || !g.isCalled() ) return; int mq0 = 0; - final ReadBackedPileup pileup = context.getBasePileup(); + final ReadBackedPileup pileup = stratifiedContext.getBasePileup(); for (PileupElement p : pileup ) { if ( p.getMappingQual() == 0 ) mq0++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index f30fb4109..fd7968747 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -210,7 +210,7 @@ public class VariantAnnotatorEngine { VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); // annotate genotypes, creating another new VC in the process - return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc)).make(); + return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap)).make(); } public VariantContext annotateContext(final Map stratifiedContexts, VariantContext vc) { @@ -278,20 +278,25 @@ public class VariantAnnotatorEngine { } } - private GenotypesContext annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + private GenotypesContext annotateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext ref, final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( requestedGenotypeAnnotations.isEmpty() ) return vc.getGenotypes(); final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); for ( final Genotype genotype : vc.getGenotypes() ) { - AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); + final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); - if ( context == null ) { + if ( context == null && perReadAlleleLikelihoodMap == null) { + // no likelihoods nor pileup available: just move on to next sample genotypes.add(genotype); } else { final GenotypeBuilder gb = new GenotypeBuilder(genotype); for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { - annotation.annotate(tracker, walker, ref, context, vc, genotype, gb); + annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap); } genotypes.add(gb.make()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java index bc20f6c97..6c55c1c00 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; @@ -13,9 +14,14 @@ import java.util.List; public abstract class GenotypeAnnotation extends VariantAnnotatorAnnotation { // return annotations for the given contexts/genotype split by sample - public abstract void annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, - ReferenceContext ref, AlignmentContext stratifiedContext, - VariantContext vc, Genotype g, GenotypeBuilder gb ); + public abstract void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap); // return the descriptions used for the VCF FORMAT meta field public abstract List getDescriptions(); From c67d708c5173ea0f960e60f381b962b046faa12d Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 20 Aug 2012 13:41:08 -0400 Subject: [PATCH 05/32] Bug fix in HaplotypeCaller for non-regular bases in the reference or reads. Those events don't get created any more. Bug fix for advanced GenotypeFullActiveRegion mode: custom variant annotations created by the HC don't make sense when in this mode so don't try to calculate them. --- .../haplotypecaller/GenotypingEngine.java | 47 ++++++++++---- .../haplotypecaller/HaplotypeCaller.java | 61 ++++++++++--------- .../LikelihoodCalculationEngine.java | 1 - .../genotyper/UnifiedArgumentCollection.java | 1 - .../broadinstitute/sting/utils/BaseUtils.java | 11 +++- 5 files changed, 76 insertions(+), 45 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 6afdc58ea..c56cf5bf2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -56,8 +56,12 @@ public class GenotypingEngine { // This function is the streamlined approach, currently not being used @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - public List>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList haplotypes, final byte[] ref, final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser ) { + public List>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, + final ArrayList haplotypes, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser ) { // Prepare the list of haplotype indices to genotype final ArrayList allelesToGenotype = new ArrayList(); @@ -224,7 +228,6 @@ public class GenotypingEngine { } } - // Walk along each position in the key set and create each event to be outputted for( final int loc : startPosKeySet ) { if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { @@ -533,24 +536,36 @@ public class GenotypingEngine { final int elementLength = ce.getLength(); switch( ce.getOperator() ) { case I: + { final ArrayList insertionAlleles = new ArrayList(); final int insertionStart = refLoc.getStart() + refPos - 1; - insertionAlleles.add( Allele.create(ref[refPos-1], true) ); + final byte refByte = ref[refPos-1]; + if( BaseUtils.isRegularBase(refByte) ) { + insertionAlleles.add( Allele.create(refByte, true) ); + } if( haplotype != null && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) { insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); } else { byte[] insertionBases = new byte[]{}; insertionBases = ArrayUtils.add(insertionBases, ref[refPos-1]); // add the padding base insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength )); - insertionAlleles.add( Allele.create(insertionBases, false) ); + if( BaseUtils.isAllRegularBases(insertionBases) ) { + insertionAlleles.add( Allele.create(insertionBases, false) ); + } + } + if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele + vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); } - vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); alignmentPos += elementLength; break; + } case S: + { alignmentPos += elementLength; break; + } case D: + { final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base final ArrayList deletionAlleles = new ArrayList(); final int deletionStart = refLoc.getStart() + refPos - 1; @@ -561,15 +576,20 @@ public class GenotypingEngine { // deletionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); // vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart, deletionAlleles).make()); //} else { + final byte refByte = ref[refPos-1]; + if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) { deletionAlleles.add( Allele.create(deletionBases, true) ); - deletionAlleles.add( Allele.create(ref[refPos-1], false) ); + deletionAlleles.add( Allele.create(refByte, false) ); vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make()); + } //} refPos += elementLength; break; + } case M: case EQ: case X: + { int numSinceMismatch = -1; int stopOfMismatch = -1; int startOfMismatch = -1; @@ -592,11 +612,13 @@ public class GenotypingEngine { if( numSinceMismatch > MNP_LOOK_AHEAD || (iii == elementLength - 1 && stopOfMismatch != -1) ) { final byte[] refBases = Arrays.copyOfRange( ref, refPosStartOfMismatch, refPosStartOfMismatch + (stopOfMismatch - startOfMismatch) + 1 ); final byte[] mismatchBases = Arrays.copyOfRange( alignment, startOfMismatch, stopOfMismatch + 1 ); - final ArrayList snpAlleles = new ArrayList(); - snpAlleles.add( Allele.create( refBases, true ) ); - snpAlleles.add( Allele.create( mismatchBases, false ) ); - final int snpStart = refLoc.getStart() + refPosStartOfMismatch; - vcs.put(snpStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), snpStart, snpStart + (stopOfMismatch - startOfMismatch), snpAlleles).make()); + if( BaseUtils.isAllRegularBases(refBases) && BaseUtils.isAllRegularBases(mismatchBases) ) { + final ArrayList snpAlleles = new ArrayList(); + snpAlleles.add( Allele.create( refBases, true ) ); + snpAlleles.add( Allele.create( mismatchBases, false ) ); + final int snpStart = refLoc.getStart() + refPosStartOfMismatch; + vcs.put(snpStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), snpStart, snpStart + (stopOfMismatch - startOfMismatch), snpAlleles).make()); + } numSinceMismatch = -1; stopOfMismatch = -1; startOfMismatch = -1; @@ -606,6 +628,7 @@ public class GenotypingEngine { alignmentPos++; } break; + } case N: case H: case P: diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index ec4fb3950..502711bb3 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -413,45 +413,48 @@ public class HaplotypeCaller extends ActiveRegionWalker implem for( final Pair>> callResult : ( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - ? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser() ) + ? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getExtendedLoc(), getToolkit().getGenomeLocParser() ) : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) { if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); } final Map>> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult ); final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst()); - - // add some custom annotations to the calls final Map myAttributes = new LinkedHashMap(annotatedCall.getAttributes()); - // Calculate the number of variants on the haplotype - int maxNumVar = 0; - for( final Allele allele : callResult.getFirst().getAlleles() ) { - if( !allele.isReference() ) { - for( final Haplotype haplotype : callResult.getSecond().get(allele) ) { - final int numVar = haplotype.getEventMap().size(); - if( numVar > maxNumVar ) { maxNumVar = numVar; } + + if( !GENOTYPE_FULL_ACTIVE_REGION ) { + // add some custom annotations to the calls + + // Calculate the number of variants on the haplotype + int maxNumVar = 0; + for( final Allele allele : callResult.getFirst().getAlleles() ) { + if( !allele.isReference() ) { + for( final Haplotype haplotype : callResult.getSecond().get(allele) ) { + final int numVar = haplotype.getEventMap().size(); + if( numVar > maxNumVar ) { maxNumVar = numVar; } + } } } - } - // Calculate the event length - int maxLength = 0; - for ( final Allele a : annotatedCall.getAlternateAlleles() ) { - final int length = a.length() - annotatedCall.getReference().length(); - if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; } - } + // Calculate the event length + int maxLength = 0; + for ( final Allele a : annotatedCall.getAlternateAlleles() ) { + final int length = a.length() - annotatedCall.getReference().length(); + if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; } + } - myAttributes.put("NVH", maxNumVar); - myAttributes.put("NumHapEval", bestHaplotypes.size()); - myAttributes.put("NumHapAssembly", haplotypes.size()); - myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size()); - myAttributes.put("EVENTLENGTH", maxLength); - myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") ); - myAttributes.put("extType", annotatedCall.getType().toString() ); + myAttributes.put("NVH", maxNumVar); + myAttributes.put("NumHapEval", bestHaplotypes.size()); + myAttributes.put("NumHapAssembly", haplotypes.size()); + myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size()); + myAttributes.put("EVENTLENGTH", maxLength); + myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") ); + myAttributes.put("extType", annotatedCall.getType().toString() ); - //if( likelihoodCalculationEngine.haplotypeScore != null ) { - // myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore)); - //} - if( annotatedCall.hasAttribute("QD") ) { - myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) ); + //if( likelihoodCalculationEngine.haplotypeScore != null ) { + // myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore)); + //} + if( annotatedCall.hasAttribute("QD") ) { + myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) ); + } } vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() ); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index fabf5633f..b5ce4b4bc 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -179,7 +179,6 @@ public class LikelihoodCalculationEngine { final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample); for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) { // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) - // log10(10^(a*x1) + 10^(b*x2)) ??? // First term is approximated by Jacobian log with table lookup. haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF ); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index e755a1e36..f11f1e599 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -181,7 +181,6 @@ public class UnifiedArgumentCollection { Generalized ploidy argument (debug only): When building site error models, ignore lane information and build only sample-level error model */ - @Argument(fullName = "ignoreLaneInfo", shortName = "ignoreLane", doc = "Ignore lane when building error model, error model is then per-site", required = false) public boolean IGNORE_LANE_INFO = false; diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 13571df78..2d7f51c3f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -227,14 +227,21 @@ public class BaseUtils { } @Deprecated - static public boolean isRegularBase(char base) { + static public boolean isRegularBase( final char base ) { return simpleBaseToBaseIndex(base) != -1; } - static public boolean isRegularBase(byte base) { + static public boolean isRegularBase( final byte base ) { return simpleBaseToBaseIndex(base) != -1; } + static public boolean isAllRegularBases( final byte[] bases ) { + for( final byte base : bases) { + if( !isRegularBase(base) ) { return false; } + } + return true; + } + static public boolean isNBase(byte base) { return base == 'N' || base == 'n'; } From 4450d66c647f949da89e9590d2c1e336e82da742 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 20 Aug 2012 15:10:24 -0400 Subject: [PATCH 06/32] Fixing the docs for DP and AD --- .../gatk/walkers/annotator/DepthOfCoverage.java | 17 ++++------------- .../annotator/DepthPerAlleleBySample.java | 12 ++++++------ 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index 28ca77f18..39b5e84dc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -22,19 +22,10 @@ import java.util.Map; /** * Total (unfiltered) depth over all samples. * - * This and AD are complementary fields that are two important ways of thinking about the depth of the data for this sample - * at this site. The DP field describe the total depth of reads that passed the Unified Genotypers internal - * quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site. - * The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the - * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the - * power I have to determine the genotype of the sample at this site, while the AD tells me how many times - * I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering - * the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like - * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would - * normally be excluded from the statistical calculations going into GQ and QUAL. - * - * Note that the DP is affected by downsampling (-dcov) though, so the max value one can obtain for N samples with - * -dcov D is N * D + * While the sample-level (FORMAT) DP field describes the total depth of reads that passed the Unified Genotyper's + * internal quality control metrics (like MAPQ > 17, for example), the INFO field DP represents the unfiltered depth + * over all samples. Note though that the DP is affected by downsampling (-dcov), so the max value one can obtain for + * N samples with -dcov D is N * D */ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index a9edab752..5d83ddd51 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -24,10 +24,10 @@ import java.util.List; /** * The depth of coverage of each VCF allele in this sample. * - * This and DP are complementary fields that are two important ways of thinking about the depth of the data for this sample - * at this site. The DP field describe the total depth of reads that passed the Unified Genotypers internal - * quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site. - * The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the + * The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this + * sample at this site. While the sample-level (FORMAT) DP field describes the total depth of reads that passed the + * Unified Genotyper's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of + * REF and ALT fields) is the unfiltered count of all reads that carried with them the * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the * power I have to determine the genotype of the sample at this site, while the AD tells me how many times * I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering @@ -35,10 +35,10 @@ import java.util.List; * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that * the AD isn't necessarily calculated exactly for indels (it counts as non-reference only those indels that - * are actually present and correctly left-aligned in the alignments themselves). Because of this fact and + * are unambiguously informative about the alternate allele). Because of this fact and * because the AD includes reads and bases that were filtered by the Unified Genotyper, one should not base * assumptions about the underlying genotype based on it; instead, the genotype likelihoods (PLs) are what - * determine the genotype calls (see below). + * determine the genotype calls. */ public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation { From 464d49509a7f3acba38b89b64b7fed8e04972365 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 20 Aug 2012 15:28:39 -0400 Subject: [PATCH 07/32] Pulling out common caller arguments into its own StandardCallerArgumentCollection base class so that every caller isn't exposed to the unused arguments from every other caller. --- .../haplotypecaller/HaplotypeCaller.java | 5 +- .../StandardCallerArgumentCollection.java | 62 +++++++++++++++++++ .../genotyper/UnifiedArgumentCollection.java | 58 ++++------------- 3 files changed, 78 insertions(+), 47 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 502711bb3..4f434bba6 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.commandline.*; @@ -189,7 +190,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem protected String[] annotationClassesToUse = { "Standard" }; @ArgumentCollection - private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); + private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection(); // the calculation arguments private UnifiedGenotyperEngine UG_engine = null; @@ -240,7 +241,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); samplesList.addAll( samples ); // initialize the UnifiedGenotyper Engine which is used to call into the exact model - UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; // the GLmodel isn't used by the HaplotypeCaller but it is dangerous to let the user change this argument + final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java new file mode 100644 index 000000000..f30fc0316 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.gatk.arguments; + +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 8/20/12 + * A collection of arguments that are common to the various callers. + * This is pulled out so that every caller isn't exposed to the arguments from every other caller. + */ + +public class StandardCallerArgumentCollection { + /** + * The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are: + * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2 + */ + @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) + public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY; + + @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) + public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; + + @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) + public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + + /** + * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with + * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this + * is the default). + */ + @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false) + public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; + + /** + * This argument allows you to emit low quality calls as filtered records. + */ + @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false) + public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; + + /** + * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding + */ + @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false) + public RodBinding alleles; + + /** + * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), + * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it + * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend + * that you not play around with this parameter. + */ + @Advanced + @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) + public int MAX_ALTERNATE_ALLELES = 3; +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index f11f1e599..30c0f3e18 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -26,11 +26,12 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -public class UnifiedArgumentCollection { +public class UnifiedArgumentCollection extends StandardCallerArgumentCollection { @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false) public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; @@ -42,13 +43,6 @@ public class UnifiedArgumentCollection { @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT; - /** - * The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are: - * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2 - */ - @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) - public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY; - /** * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily * distinguish between PCR errors vs. sequencing errors. The practical implication for this value is that it @@ -57,26 +51,6 @@ public class UnifiedArgumentCollection { @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false) public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE; - @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) - public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; - - @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) - public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; - - /** - * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with - * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this - * is the default). - */ - @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false) - public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0; - - /** - * This argument allows you to emit low quality calls as filtered records. - */ - @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false) - public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0; - /** * Note that calculating the SLOD increases the runtime by an appreciable amount. */ @@ -90,12 +64,6 @@ public class UnifiedArgumentCollection { @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false) public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false; - /** - * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding - */ - @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false) - public RodBinding alleles; - /** * The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base * is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value. @@ -107,16 +75,6 @@ public class UnifiedArgumentCollection { @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; - /** - * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES), - * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it - * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend - * that you not play around with this parameter. - */ - @Advanced - @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) - public int MAX_ALTERNATE_ALLELES = 3; - @Hidden @Argument(fullName = "cap_max_alternate_alleles_for_indels", shortName = "capMaxAltAllelesForIndels", doc = "Cap the maximum number of alternate alleles to genotype for indel calls at 2; overrides the --max_alternate_alleles argument; GSA production use only", required = false) public boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = false; @@ -139,7 +97,6 @@ public class UnifiedArgumentCollection { @Argument(fullName = "min_indel_fraction_per_sample", shortName = "minIndelFrac", doc = "Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles", required = false) public double MIN_INDEL_FRACTION_PER_SAMPLE = 0.25; - /** * This argument informs the prior probability of having an indel at a site. */ @@ -274,5 +231,16 @@ public class UnifiedArgumentCollection { return uac; } + public UnifiedArgumentCollection() { } + public UnifiedArgumentCollection( final StandardCallerArgumentCollection SCAC ) { + super(); + this.alleles = SCAC.alleles; + this.GenotypingMode = SCAC.GenotypingMode; + this.heterozygosity = SCAC.heterozygosity; + this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; + this.OutputMode = SCAC.OutputMode; + this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; + this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; + } } From a9472c198061f5127225e8e632a9a97dd5c40985 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 20 Aug 2012 16:11:45 -0400 Subject: [PATCH 08/32] Another round of FindBugs fixes. Inefficient use of keySet iterator instead of entrySet iterator. --- .../walkers/haplotypecaller/LikelihoodCalculationEngine.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index b5ce4b4bc..a69e39401 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -77,10 +77,10 @@ public class LikelihoodCalculationEngine { PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); // for each sample's reads - for( final String sample : perSampleReadList.keySet() ) { + for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } // evaluate the likelihood of the reads given those haplotypes - computeReadLikelihoods( haplotypes, perSampleReadList.get(sample), sample, matchMetricArray, XMetricArray, YMetricArray ); + computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey(), matchMetricArray, XMetricArray, YMetricArray ); } } From 5e28bca63023f8183cffaaf259e47e3fa0c7be48 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 20 Aug 2012 16:15:48 -0400 Subject: [PATCH 09/32] Another round of FindBugs fixes. Should be static inner class. --- .../java/src/org/broadinstitute/sting/utils/MannWhitneyU.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java b/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java index d1bc75583..99f43876c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java +++ b/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java @@ -434,7 +434,7 @@ public class MannWhitneyU { * A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values * and to ensure that rank ties are broken at random. */ - private class DitheringComparator implements Comparator> { + private static class DitheringComparator implements Comparator> { public DitheringComparator() {} From 77fbaec04406afb7f621a1eb8199a3936ec7056f Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 20 Aug 2012 16:55:00 -0400 Subject: [PATCH 10/32] Another round of FindBugs fixes. Class implements its own compareTo() but uses base Object.equals() which can lead to unpredictable behavior. --- .../walkers/haplotypecaller/DeBruijnEdge.java | 12 +++++++---- .../walkers/haplotypecaller/KBestPaths.java | 4 +++- .../traversals/TraverseActiveRegions.java | 11 +++++----- .../walkers/variantrecalibration/Tranche.java | 14 ++++++++----- .../variantrecalibration/TrancheManager.java | 2 +- .../VariantDataManager.java | 2 +- .../variantrecalibration/VariantDatum.java | 12 +++++++---- .../utils/activeregion/ActiveRegion.java | 21 ++++++++++++++----- 8 files changed, 52 insertions(+), 26 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java index 0890ac20c..39bb3617f 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java @@ -2,6 +2,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.jgrapht.graph.DefaultDirectedGraph; +import java.util.Comparator; + /** * Created by IntelliJ IDEA. * User: ebanks @@ -9,7 +11,7 @@ import org.jgrapht.graph.DefaultDirectedGraph; */ // simple edge class for connecting nodes in the graph -public class DeBruijnEdge implements Comparable { +public class DeBruijnEdge { private int multiplicity; private boolean isRef; @@ -53,8 +55,10 @@ public class DeBruijnEdge implements Comparable { return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge))); } - @Override - public int compareTo( final DeBruijnEdge that ) { - return this.multiplicity - that.multiplicity; + public static class EdgeWeightComparator implements Comparator { + @Override + public int compare(final DeBruijnEdge edge1, final DeBruijnEdge edge2) { + return edge1.multiplicity - edge2.multiplicity; + } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java index 0ef1a13a4..4a5a906f2 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java @@ -77,12 +77,14 @@ public class KBestPaths { } protected static class PathComparatorTotalScore implements Comparator { + @Override public int compare(final Path path1, final Path path2) { return path1.totalScore - path2.totalScore; } } //protected static class PathComparatorLowestEdge implements Comparator { + // @Override // public int compare(final Path path1, final Path path2) { // return path2.lowestEdge - path1.lowestEdge; // } @@ -124,7 +126,7 @@ public class KBestPaths { // recursively run DFS final ArrayList edgeArrayList = new ArrayList(); edgeArrayList.addAll(graph.outgoingEdgesOf(path.lastVertex)); - Collections.sort(edgeArrayList); + Collections.sort(edgeArrayList, new DeBruijnEdge.EdgeWeightComparator()); Collections.reverse(edgeArrayList); for ( final DeBruijnEdge edge : edgeArrayList ) { // make sure the edge is not already in the path diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 979e0f2d6..67de427e8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -12,6 +12,7 @@ import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfile; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -31,7 +32,7 @@ public class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList(); + private final LinkedList workQueue = new LinkedList(); private final LinkedHashSet myReads = new LinkedHashSet(); @Override @@ -110,18 +111,18 @@ public class TraverseActiveRegions extends TraversalEngine activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ); + final List activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ); // add active regions to queue of regions to process // first check if can merge active regions over shard boundaries if( !activeRegions.isEmpty() ) { if( !workQueue.isEmpty() ) { - final org.broadinstitute.sting.utils.activeregion.ActiveRegion last = workQueue.getLast(); - final org.broadinstitute.sting.utils.activeregion.ActiveRegion first = activeRegions.get(0); + final ActiveRegion last = workQueue.getLast(); + final ActiveRegion first = activeRegions.get(0); if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { workQueue.removeLast(); activeRegions.remove(first); - workQueue.add( new org.broadinstitute.sting.utils.activeregion.ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); + workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); } } workQueue.addAll( activeRegions ); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java index 9228dc375..14ef63264 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java @@ -41,7 +41,7 @@ import java.util.*; * Date: Mar 10, 2011 */ -public class Tranche implements Comparable { +public class Tranche { private static final int CURRENT_VERSION = 5; public double ts, minVQSLod, knownTiTv, novelTiTv; @@ -83,10 +83,14 @@ public class Tranche implements Comparable { return accessibleTruthSites > 0 ? callsAtTruthSites / (1.0*accessibleTruthSites) : 0.0; } - public int compareTo(Tranche other) { - return Double.compare(this.ts, other.ts); + public static class TrancheTruthSensitivityComparator implements Comparator { + @Override + public int compare(final Tranche tranche1, final Tranche tranche2) { + return Double.compare(tranche1.ts, tranche2.ts); + } } + @Override public String toString() { return String.format("Tranche ts=%.2f minVQSLod=%.4f known=(%d @ %.4f) novel=(%d @ %.4f) truthSites(%d accessible, %d called), name=%s]", ts, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, name); @@ -102,7 +106,7 @@ public class Tranche implements Comparable { final ByteArrayOutputStream bytes = new ByteArrayOutputStream(); final PrintStream stream = new PrintStream(bytes); - Collections.sort(tranches); + Collections.sort( tranches, new TrancheTruthSensitivityComparator() ); stream.println("# Variant quality score tranches file"); stream.println("# Version number " + CURRENT_VERSION); @@ -183,7 +187,7 @@ public class Tranche implements Comparable { } } - Collections.sort(tranches); + Collections.sort( tranches, new TrancheTruthSensitivityComparator() ); return tranches; } catch( FileNotFoundException e ) { throw new UserException.CouldNotReadInputFile(f, e); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java index af0778399..58b4e4fc7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java @@ -146,7 +146,7 @@ public class TrancheManager { public static List findTranches( final ArrayList data, final double[] trancheThresholds, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model, final File debugFile ) { logger.info(String.format("Finding %d tranches for %d variants", trancheThresholds.length, data.size())); - Collections.sort(data); + Collections.sort( data, new VariantDatum.VariantDatumLODComparator() ); metric.calculateRunningMetric(data); if ( debugFile != null) { writeTranchesDebuggingInfo(debugFile, data, metric); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index e88505f99..ba31d53cb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -158,7 +158,7 @@ public class VariantDataManager { logger.info( "Found " + numBadSitesAdded + " variants overlapping bad sites training tracks." ); // Next sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants - Collections.sort( data ); + Collections.sort( data, new VariantDatum.VariantDatumLODComparator() ); final int numToAdd = Math.max( minimumNumber - trainingData.size(), Math.round((float)bottomPercentage * data.size()) ); if( numToAdd > data.size() ) { throw new UserException.BadInput( "Error during negative model training. Minimum number of variants to use in training is larger than the whole call set. One can attempt to lower the --minNumBadVariants arugment but this is unsafe." ); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java index a85129d78..55052f73b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java @@ -27,13 +27,15 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.broadinstitute.sting.utils.GenomeLoc; +import java.util.Comparator; + /** * Created by IntelliJ IDEA. * User: rpoplin * Date: Mar 4, 2011 */ -public class VariantDatum implements Comparable { +public class VariantDatum { public double[] annotations; public boolean[] isNull; @@ -52,8 +54,10 @@ public class VariantDatum implements Comparable { public int worstAnnotation; public MultivariateGaussian assignment; // used in K-means implementation - @Override - public int compareTo( final VariantDatum other ) { - return Double.compare(this.lod, other.lod); + public static class VariantDatumLODComparator implements Comparator { + @Override + public int compare(final VariantDatum datum1, final VariantDatum datum2) { + return Double.compare(datum1.lod, datum2.lod); + } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 8e660350f..6756c1c02 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -15,7 +15,7 @@ import java.util.ArrayList; * Date: 1/4/12 */ -public class ActiveRegion implements HasGenomeLocation, Comparable { +public class ActiveRegion implements HasGenomeLocation { private final ArrayList reads = new ArrayList(); private final GenomeLoc activeRegionLoc; @@ -73,10 +73,6 @@ public class ActiveRegion implements HasGenomeLocation, Comparable Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases(); } - @Override - public int compareTo( final ActiveRegion other ) { - return this.getLocation().compareTo(other.getLocation()); - } @Override public GenomeLoc getLocation() { return activeRegionLoc; } @@ -97,4 +93,19 @@ public class ActiveRegion implements HasGenomeLocation, Comparable if ( extendedLoc.compareTo(other.extendedLoc) != 0 ) return false; return true; } + + /** + * A comparator class which is used to sort ActiveRegions by their start location + */ + /* + public static class ActiveRegionStartLocationComparator implements Comparator { + + public ActiveRegionStartLocationComparator() {} + + @Override + public int compare(final ActiveRegion left, final ActiveRegion right) { + return left.getLocation().compareTo(right.getLocation()); + } + } + */ } \ No newline at end of file From 2041cb853cafaab185c44b101e131d2a92a5dd2a Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Mon, 20 Aug 2012 20:31:34 -0400 Subject: [PATCH 11/32] New implementation of AD - ignore now non-informative reads based on per-read likelihoods --- .../annotator/DepthPerAlleleBySample.java | 54 +++++++------------ 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 8922bf54a..80c10fa5f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -12,6 +12,7 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; @@ -20,6 +21,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; import java.util.HashMap; import java.util.List; +import java.util.Map; /** @@ -35,8 +37,9 @@ import java.util.List; * the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that - * the AD isn't necessarily calculated exactly for indels (it counts as non-reference only those indels that - * are actually present and correctly left-aligned in the alignments themselves). Because of this fact and + * the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted. + * Because of this fact, the sum of AD may be much lower than the individual sample depth, especially when there are + * many non-informatice reads. * because the AD includes reads and bases that were filtered by the Unified Genotyper, one should not base * assumptions about the underlying genotype based on it; instead, the genotype likelihoods (PLs) are what * determine the genotype calls (see below). @@ -54,13 +57,13 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa if ( g == null || !g.isCalled() ) return; - if ( vc.isSNP() ) - annotateSNP(stratifiedContext, vc, gb); - else if ( vc.isIndel() ) - annotateIndel(stratifiedContext, ref.getBase(), vc, gb); + if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) + annotateWithLikelihoods(alleleLikelihoodMap, ref.getBase(), vc, gb); + else if ( vc.isSNP() && stratifiedContext != null) + annotateWithPileup(stratifiedContext, vc, gb); } - private void annotateSNP(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) { + private void annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) { HashMap alleleCounts = new HashMap(); for ( Allele allele : vc.getAlleles() ) @@ -81,48 +84,29 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa gb.AD(counts); } - private void annotateIndel(final AlignmentContext stratifiedContext, final byte refBase, final VariantContext vc, final GenotypeBuilder gb) { - ReadBackedPileup pileup = stratifiedContext.getBasePileup(); - if ( pileup == null ) - return; - + private void annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final byte refBase, final VariantContext vc, final GenotypeBuilder gb) { final HashMap alleleCounts = new HashMap(); - final Allele refAllele = vc.getReference(); for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); } + for (Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (a.isNoCall()) + continue; // read is non-informative + if (!vc.getAlleles().contains(a)) + continue; // sanity check - shouldn't be needed + alleleCounts.put(a,alleleCounts.get(a)+1); - for ( PileupElement p : pileup ) { - if ( p.isBeforeInsertion() ) { - - final Allele insertion = Allele.create((char)refBase + p.getEventBases(), false); - if ( alleleCounts.containsKey(insertion) ) { - alleleCounts.put(insertion, alleleCounts.get(insertion)+1); - } - - } else if ( p.isBeforeDeletionStart() ) { - if ( p.getEventLength() == refAllele.length() - 1 ) { - // this is indeed the deletion allele recorded in VC - final Allele deletion = Allele.create(refBase); - if ( alleleCounts.containsKey(deletion) ) { - alleleCounts.put(deletion, alleleCounts.get(deletion)+1); - } - } - } else if ( p.getRead().getAlignmentEnd() > vc.getStart() ) { - alleleCounts.put(refAllele, alleleCounts.get(refAllele)+1); - } } - final int[] counts = new int[alleleCounts.size()]; - counts[0] = alleleCounts.get(refAllele); + counts[0] = alleleCounts.get(vc.getReference()); for (int i = 0; i < vc.getAlternateAlleles().size(); i++) counts[i+1] = alleleCounts.get( vc.getAlternateAllele(i) ); gb.AD(counts); } - // public String getIndelBases() public List getKeyNames() { return Arrays.asList(VCFConstants.GENOTYPE_ALLELE_DEPTHS); } public List getDescriptions() { From 286b658fab8bd062e70bfa3158694dc212fb602b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 20 Aug 2012 21:25:14 -0400 Subject: [PATCH 12/32] Re-enabling parallelism in the BaseRecalibrator now that the release is out. --- .../sting/gatk/walkers/bqsr/BQSRIntegrationTest.java | 9 ++++----- .../sting/gatk/walkers/bqsr/BaseRecalibrator.java | 4 ---- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index 580667ee2..bd75806dd 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -75,11 +75,10 @@ public class BQSRIntegrationTest extends WalkerTest { Arrays.asList(params.md5)); executeTest("testBQSR-"+params.args, spec).getFirst(); - // TODO -- re-enable once parallelization is fixed in BaseRecalibrator - //WalkerTestSpec specNT2 = new WalkerTestSpec( - // params.getCommandLine() + " -nt 2", - // Arrays.asList(params.md5)); - //executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst(); + WalkerTestSpec specNT2 = new WalkerTestSpec( + params.getCommandLine() + " -nt 2", + Arrays.asList(params.md5)); + executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst(); } @Test diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 91d982f20..e45cad971 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -136,10 +136,6 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed */ public void initialize() { - // TODO -- remove me after the 2.1 release - if ( getToolkit().getArguments().numberOfThreads > 1 ) - throw new UserException("We have temporarily disabled the ability to run BaseRecalibrator multi-threaded for performance reasons. We hope to have this fixed for the next GATK release (2.2) and apologize for the inconvenience."); - // check for unsupported access if (getToolkit().isGATKLite() && !getToolkit().getArguments().disableIndelQuals) throw new UserException.NotSupportedInGATKLite("base insertion/deletion recalibration is not supported, please use the --disable_indel_quals argument"); From 3514fb6e6620a0e289837a8fb6c216de9d14c608 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Mon, 20 Aug 2012 21:39:38 -0400 Subject: [PATCH 13/32] Changed the default memory limit from none to 2GB upon suggestions from delangel, carneiro, and depristo. --- .../scala/src/org/broadinstitute/sting/queue/QSettings.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala index 1a50301f1..429428c4c 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala @@ -52,8 +52,8 @@ class QSettings { @Argument(fullName="job_environment_name", shortName="jobEnv", doc="Environment names for the job runner.", required=false) var jobEnvironmentNames: Seq[String] = Nil - @Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes.", required=false) - var memoryLimit: Option[Double] = None + @Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes. If not set defaults to 2GB.", required=false) + var memoryLimit: Option[Double] = Some(2) @Argument(fullName="memory_limit_threshold", shortName="memLimitThresh", doc="After passing this threshold stop increasing memory limit for jobs, in gigabytes.", required=false) var memoryLimitThreshold: Option[Double] = None From 40d5efc8043d2fc93642f35d6731a7b297858ccf Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 20 Aug 2012 23:12:41 -0400 Subject: [PATCH 14/32] Fix for Adam K's reported bug: we weren't handling reads that were entirely insertions properly in LIBS. Specifically, the event bases were off-by-one (which was disasterous in Adam's case with a 1bp read). Added a unit test to cover this case. --- .../gatk/iterators/LocusIteratorByState.java | 9 ++-- .../LocusIteratorByStateUnitTest.java | 45 +++++++++++++++---- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index f97069189..b5d5dfc0e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -301,6 +301,7 @@ public class LocusIteratorByState extends LocusIterator { final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element + final boolean isSingleElementCigar = nextElement == lastElement; final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator final int readOffset = state.getReadOffset(); // the base offset on this read @@ -308,7 +309,7 @@ public class LocusIteratorByState extends LocusIterator { final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION; final boolean isAfterDeletion = lastOp == CigarOperator.DELETION; final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION; - final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION; + final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION && !isSingleElementCigar; final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()); int nextElementLength = nextElement.getLength(); @@ -328,8 +329,10 @@ public class LocusIteratorByState extends LocusIterator { else { if (!filterBaseInRead(read, location.getStart())) { String insertedBaseString = null; - if (nextOp == CigarOperator.I) - insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength())); + if (nextOp == CigarOperator.I) { + final int insertionOffset = isSingleElementCigar ? 0 : 1; + insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength())); + } pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength)); size++; diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index dc908c323..e65a7fb27 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -104,7 +104,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { after.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); after.setCigarString("10M"); - List reads = Arrays.asList(before,during,after); + List reads = Arrays.asList(before, during, after); // create the iterator by state with the fake reads and fake records li = makeLTBS(reads,readAttributes); @@ -134,9 +134,9 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // create a test version of the Reads object ReadProperties readAttributes = createTestReadProperties(); - SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,firstLocus,76); + SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, 76); indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76)); - indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); + indelOnlyRead.setBaseQualities(Utils.dupBytes((byte) '@', 76)); indelOnlyRead.setCigarString("76I"); List reads = Arrays.asList(indelOnlyRead); @@ -148,10 +148,10 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // and considers it to be an indel-containing read. Assert.assertTrue(li.hasNext(),"Should have found a whole-indel read in the normal base pileup without extended events enabled"); AlignmentContext alignmentContext = li.next(); - Assert.assertEquals(alignmentContext.getLocation().getStart(),firstLocus,"Base pileup is at incorrect location."); + Assert.assertEquals(alignmentContext.getLocation().getStart(), firstLocus, "Base pileup is at incorrect location."); ReadBackedPileup basePileup = alignmentContext.getBasePileup(); Assert.assertEquals(basePileup.getReads().size(),1,"Pileup is of incorrect size"); - Assert.assertSame(basePileup.getReads().get(0),indelOnlyRead,"Read in pileup is incorrect"); + Assert.assertSame(basePileup.getReads().get(0), indelOnlyRead, "Read in pileup is incorrect"); } /** @@ -168,7 +168,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { leadingRead.setCigarString("1M75I"); SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76); - indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76)); + indelOnlyRead.setReadBases(Utils.dupBytes((byte) 'A', 76)); indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); indelOnlyRead.setCigarString("76I"); @@ -177,7 +177,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',76)); fullMatchAfterIndel.setCigarString("75I1M"); - List reads = Arrays.asList(leadingRead,indelOnlyRead,fullMatchAfterIndel); + List reads = Arrays.asList(leadingRead, indelOnlyRead, fullMatchAfterIndel); // create the iterator by state with the fake reads and fake records li = makeLTBS(reads, createTestReadProperties()); @@ -204,7 +204,36 @@ public class LocusIteratorByStateUnitTest extends BaseTest { numAlignmentContextsFound++; } - Assert.assertEquals(numAlignmentContextsFound,2,"Found incorrect number of alignment contexts"); + Assert.assertEquals(numAlignmentContextsFound, 2, "Found incorrect number of alignment contexts"); + } + + /** + * Test to make sure that reads supporting only an indel (example cigar string: 76I) do + * not negatively influence the ordering of the pileup. + */ + @Test + public void testWholeIndelReadTest2() { + final int firstLocus = 44367788, secondLocus = firstLocus + 1; + + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header,"read",0,secondLocus,1); + read.setReadBases(Utils.dupBytes((byte) 'A', 1)); + read.setBaseQualities(Utils.dupBytes((byte) '@', 1)); + read.setCigarString("1I"); + + List reads = Arrays.asList(read); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "A"); + } } private static ReadProperties createTestReadProperties() { From ba8622ff0d30298dce4418760ed4a434fa1bda02 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Tue, 21 Aug 2012 07:03:50 -0400 Subject: [PATCH 15/32] number of stashed changes are lurking in here. In order of importance: - Fix for M_Trieb's error report on the forum, and addition of integration tests to cover the walker. - Addition of StructuralIndel as a class of variation within the VariantContext. These are for variants with a full alt allele that's >150bp in length. - Adaptation of the MVLikelihoodRatio to work for a set of trios (takes the max over the trios of the MVLR) - InsertSizeDistribution changed to use the new gatk report output (it was previously broken) - RetrogeneDiscovery changed to be compatible with the new gatk report - A maxIndelSize argument added to SelectVariants - ByTranscriptEvaluator rewritten for cleanliness - VariantRecalibrator modified to not exclude structural indels from recalibration if the mode is INDEL - Documentation added to DepthOfCoverageIntegrationTest (no, don't yell at chartl ;_; ) Also sorry for the long commit history behind this that is the result of fixing merge conflicts. Because this *also* fixes a conflict (from git stash apply), for some reason I can't rebase all of them away. I'm pretty sure some of the commit notes say "this note isn't important because I'm going to rebase it anyway". --- .../walkers/annotator/MVLikelihoodRatio.java | 64 +++++---- .../VariantDataManager.java | 2 +- .../walkers/variantutils/SelectVariants.java | 20 +++ .../variantutils/VariantsToBinaryPed.java | 136 +++++++++++------- .../utils/variantcontext/VariantContext.java | 25 +++- .../DepthOfCoverageIntegrationTest.java | 2 +- .../VariantsToBinaryPedIntegrationTest.java | 92 ++++++++++++ 7 files changed, 260 insertions(+), 81 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index b6f24433e..7b0db6855 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -28,9 +29,13 @@ import java.util.*; public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { private MendelianViolation mendelianViolation = null; - private String motherId; - private String fatherId; - private String childId; + private Set trios; + + private class Trio { + String motherId; + String fatherId; + String childId; + } public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( mendelianViolation == null ) { @@ -38,17 +43,27 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP ); } else { - throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line containing only 1 trio."); + throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line."); } } Map toRet = new HashMap(1); - boolean hasAppropriateGenotypes = vc.hasGenotype(motherId) && vc.getGenotype(motherId).hasLikelihoods() && - vc.hasGenotype(fatherId) && vc.getGenotype(fatherId).hasLikelihoods() && - vc.hasGenotype(childId) && vc.getGenotype(childId).hasLikelihoods(); - if ( hasAppropriateGenotypes ) - toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc,motherId,fatherId,childId)); + //double pNoMV = 1.0; + double maxMVLR = Double.MIN_VALUE; + for ( Trio trio : trios ) { + boolean hasAppropriateGenotypes = vc.hasGenotype(trio.motherId) && vc.getGenotype(trio.motherId).hasLikelihoods() && + vc.hasGenotype(trio.fatherId) && vc.getGenotype(trio.fatherId).hasLikelihoods() && + vc.hasGenotype(trio.childId) && vc.getGenotype(trio.childId).hasLikelihoods(); + if ( hasAppropriateGenotypes ) { + Double likR = mendelianViolation.violationLikelihoodRatio(vc,trio.motherId,trio.fatherId,trio.childId); + maxMVLR = likR > maxMVLR ? likR : maxMVLR; + //pNoMV *= (1.0-Math.pow(10.0,likR)/(1+Math.pow(10.0,likR))); + } + } + //double pSomeMV = 1.0-pNoMV; + //toRet.put("MVLR",Math.log10(pSomeMV)-Math.log10(1.0-pSomeMV)); + toRet.put("MVLR",maxMVLR); return toRet; } @@ -58,25 +73,24 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); } private boolean checkAndSetSamples(SampleDB db){ + trios = new HashSet(); Set families = db.getFamilyIDs(); - if(families.size() != 1) - return false; - - Set family = db.getFamily(families.iterator().next()); - if(family.size() != 3) - return false; - - Iterator sampleIter = family.iterator(); - Sample sample; - for(sample = sampleIter.next();sampleIter.hasNext();sample=sampleIter.next()){ - if(sample.getParents().size()==2){ - motherId = sample.getMaternalID(); - fatherId = sample.getPaternalID(); - childId = sample.getID(); - return true; + for ( String familyString : families ) { + Set family = db.getFamily(familyString); + Iterator sampleIterator = family.iterator(); + Sample sample; + for ( sample = sampleIterator.next(); sampleIterator.hasNext(); sample=sampleIterator.next()) { + if ( sample.getParents().size() == 2 ) { + Trio trio = new Trio(); + trio.childId = sample.getID(); + trio.fatherId = sample.getFather().getID(); + trio.motherId = sample.getMother().getID(); + trios.add(trio); + } } } - return false; + + return trios.size() > 0; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index e88505f99..1f06cc249 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -297,7 +297,7 @@ public class VariantDataManager { case SNP: return evalVC.isSNP() || evalVC.isMNP(); case INDEL: - return evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic(); + return evalVC.isStructuralIndel() || evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic(); case BOTH: return true; default: diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index bfd9aa52f..4c0c0cabf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -322,6 +322,9 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="justRead", doc="If true, we won't actually write the output file. For efficiency testing only", required=false) private boolean justRead = false; + @Argument(doc="indel size select",required=false,fullName="maxIndelSize") + private int maxIndelSize = Integer.MAX_VALUE; + /* Private class used to store the intermediate variants in the integer random selection process */ private static class RandomVariantStructure { @@ -541,6 +544,9 @@ public class SelectVariants extends RodWalker implements TreeR if (!selectedTypes.contains(vc.getType())) continue; + if ( badIndelSize(vc) ) + continue; + VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS); if ( REGENOTYPE && sub.isPolymorphicInSamples() && hasPLs(sub) ) { @@ -572,6 +578,20 @@ public class SelectVariants extends RodWalker implements TreeR return 1; } + private boolean badIndelSize(final VariantContext vc) { + if ( vc.getReference().length() > maxIndelSize ) { + return true; + } + + for ( Allele a : vc.getAlternateAlleles() ) { + if ( a.length() > maxIndelSize ) { + return true; + } + } + + return false; + } + private boolean hasPLs(final VariantContext vc) { for ( Genotype g : vc.getGenotypes() ) { if ( g.hasLikelihoods() ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index 3fba8fa77..7111bac46 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -76,47 +76,11 @@ public class VariantsToBinaryPed extends RodWalker { private List famOrder = new ArrayList(); public void initialize() { - vv.variantCollection = variantCollection; - vv.dbsnp = dbsnp; - vv.DO_NOT_VALIDATE_FILTERED = true; - vv.type = ValidateVariants.ValidationType.REF; + initializeValidator(); + writeBedHeader(); + Map> sampleMetaValues = parseMetaData(); // create temporary output streams and buffers - // write magic bits into the ped file - try { - outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); - // ultimately, the bed will be in individual-major mode - } catch (IOException e) { - throw new ReviewedStingException("error writing to output file."); - } - // write to the fam file, the first six columns of the standard ped file - // first, load data from the input meta data file - Map> metaValues = new HashMap>(); - logger.debug("Reading in metadata..."); - try { - if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) { - for ( String line : new XReadLines(metaDataFile) ) { - String[] famSplit = line.split("\\t"); - String sid = famSplit[1]; - outFam.printf("%s%n",line); - } - } else { - for ( String line : new XReadLines(metaDataFile) ) { - logger.debug(line); - String[] split = line.split("\\t"); - String sampleID = split[0]; - String keyVals = split[1]; - HashMap values = new HashMap(); - for ( String kvp : keyVals.split(";") ) { - String[] kvp_split = kvp.split("="); - values.put(kvp_split[0],kvp_split[1]); - } - metaValues.put(sampleID,values); - } - } - } catch (FileNotFoundException e) { - throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e); - } // family ID, individual ID, Paternal ID, Maternal ID, Sex, Phenotype int dummyID = 0; // increments for dummy parental and family IDs used // want to be especially careful to maintain order here @@ -126,21 +90,23 @@ public class VariantsToBinaryPed extends RodWalker { continue; } for ( String sample : header.getValue().getGenotypeSamples() ) { - Map mVals = metaValues.get(sample); - if ( mVals == null ) { - throw new UserException("No metadata provided for sample "+sample); + if ( ! metaDataFile.getAbsolutePath().endsWith(".fam") ) { + Map mVals = sampleMetaValues.get(sample); + if ( mVals == null ) { + throw new UserException("No metadata provided for sample "+sample); + } + if ( ! mVals.containsKey("phenotype") ) { + throw new UserException("No phenotype data provided for sample "+sample); + } + String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID); + String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID); + String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID); + String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3"; + String pheno = mVals.get("phenotype"); + outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); } - if ( ! mVals.containsKey("phenotype") ) { - throw new UserException("No phenotype data provided for sample "+sample); - } - String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID); - String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID); - String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID); - String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3"; - String pheno = mVals.get("phenotype"); - outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); try { - File temp = File.createTempFile(sample, ".tmp"); + File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp"); printMap.put(sample,new PrintStream(temp)); tempFiles.put(sample,temp); } catch (IOException e) { @@ -216,6 +182,7 @@ public class VariantsToBinaryPed extends RodWalker { // reset the buffer for this sample genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); } + byteCount = 0; } genotypeCount = 0; } @@ -337,4 +304,69 @@ public class VariantsToBinaryPed extends RodWalker { throw new UserException("Allele frequency appears to be neither String nor Double. Please check the header of your VCF."); } } + + private void initializeValidator() { + vv.variantCollection = variantCollection; + vv.dbsnp = dbsnp; + vv.DO_NOT_VALIDATE_FILTERED = true; + vv.type = ValidateVariants.ValidationType.REF; + } + + private void writeBedHeader() { + // write magic bits into the ped file + try { + outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); + // ultimately, the bed will be in individual-major mode + } catch (IOException e) { + throw new ReviewedStingException("error writing to output file."); + } + } + + private Map> parseMetaData() { + // write to the fam file, the first six columns of the standard ped file + // first, load data from the input meta data file + Map> metaValues = new HashMap>(); + logger.debug("Reading in metadata..."); + try { + if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) { + for ( String line : new XReadLines(metaDataFile) ) { + String[] famSplit = line.split("\\s+"); + if ( famSplit.length != 6 ) { + throw new UserException("Line of the fam file is malformatted. Expected 6 entries. Line is "+line); + } + String sid = famSplit[1]; + String fid = famSplit[0]; + String mom = famSplit[2]; + String dad = famSplit[3]; + String sex = famSplit[4]; + String pheno = famSplit[5]; + HashMap values = new HashMap(); + values.put("mom",mom); + values.put("dad",dad); + values.put("fid",fid); + values.put("sex",sex); + values.put("phenotype",pheno); + metaValues.put(sid,values); + outFam.printf("%s%n",line); + } + } else { + for ( String line : new XReadLines(metaDataFile) ) { + logger.debug(line); + String[] split = line.split("\\s+"); + String sampleID = split[0]; + String keyVals = split[1]; + HashMap values = new HashMap(); + for ( String kvp : keyVals.split(";") ) { + String[] kvp_split = kvp.split("="); + values.put(kvp_split[0],kvp_split[1]); + } + metaValues.put(sampleID,values); + } + } + } catch (FileNotFoundException e) { + throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e); + } + + return metaValues; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 1fe6b8652..8015889f5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.utils.variantcontext; +import org.apache.commons.math.stat.descriptive.rank.Max; import org.apache.log4j.Logger; import org.broad.tribble.Feature; import org.broad.tribble.TribbleException; @@ -178,9 +179,8 @@ import java.util.*; */ public class VariantContext implements Feature { // to enable tribble integration private final static boolean WARN_ABOUT_BAD_END = true; + private final static long MAX_ALLELE_SIZE_FOR_NON_SV = 150; final protected static Logger logger = Logger.getLogger(VariantContext.class); - - private boolean fullyDecoded = false; protected CommonInfo commonInfo = null; public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; @@ -458,6 +458,7 @@ public class VariantContext implements Feature { // to enable tribble integratio SNP, MNP, // a multi-nucleotide polymorphism INDEL, + STRUCTURAL_INDEL, SYMBOLIC, MIXED, } @@ -530,6 +531,18 @@ public class VariantContext implements Feature { // to enable tribble integratio return getType() == Type.SYMBOLIC; } + public boolean isStructuralIndel() { + return getType() == Type.STRUCTURAL_INDEL; + } + + /** + * + * @return true if the variant is symbolic or a large indel + */ + public boolean isSymbolicOrSV() { + return isSymbolic() || isStructuralIndel(); + } + public boolean isMNP() { return getType() == Type.MNP; } @@ -1250,6 +1263,14 @@ public class VariantContext implements Feature { // to enable tribble integratio // performs a pairwise comparison of a single alternate allele against the reference allele (whereas the MIXED type // is reserved for cases of multiple alternate alleles of different types). Therefore, if we've reached this point // in the code (so we're not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL. + + // Because a number of structural variation callers write the whole alternate allele into the VCF where possible, + // this can result in insertion/deletion alleles of structural variant size, e.g. 151+. As of July 2012, we now + // classify these as structural events, rather than indel events, as we think differently about the mechanism, + // representation, and handling of these events. Check for this case here: + if ( ref.length() > MAX_ALLELE_SIZE_FOR_NON_SV || allele.length() > MAX_ALLELE_SIZE_FOR_NON_SV ) + return Type.STRUCTURAL_INDEL; + return Type.INDEL; // old incorrect logic: diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java index 6f1370008..9bec1b75d 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java @@ -9,7 +9,7 @@ import java.util.Arrays; import java.util.List; /** - * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * Integration tests for the Depth of Coverage walker * * @Author chartl * @Date Feb 25, 2010 diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java new file mode 100644 index 000000000..07e82b869 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java @@ -0,0 +1,92 @@ +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 8/20/12 + * Time: 9:57 PM + * To change this template use File | Settings | File Templates. + */ +public class VariantsToBinaryPedIntegrationTest extends WalkerTest { + + public static final String VTBP_DATA_DIR = "/humgen/gsa-hpprojects/GATK/data/Validation_Data/VariantsToBinaryPed/"; + + public static String baseTestString(String inputVCF, String inputMetaData, int gq) { + return "-T VariantsToBinaryPed -R " + b37KGReference + + " -V " + VTBP_DATA_DIR+inputVCF + " -m "+VTBP_DATA_DIR+inputMetaData + String.format(" -mgq %d",gq) + + " -bim %s -fam %s -bed %s"; + + } + + @Test + public void testNA12878Alone() { + String testName = "testNA12878Alone"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.fam",10), + 3, + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","8e8bc0b5e69f22c54c0960f13c25d26c","02f1c462ebc8576e399d0e94f729fd95") + ); + + executeTest(testName, spec); + } + + @Test + public void testNA12878AloneMetaData() { + String testName = "testNA12878AloneMetaData"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",10), + 3, + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","02f1c462ebc8576e399d0e94f729fd95") + ); + + executeTest(testName, spec); + } + + @Test + public void testCEUTrio() { + String testName = "testCEUTrio"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("CEUTrio.subset.vcf", "CEUTrio.fam",10), + 3, + Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","900f22c6d49a6ba0774466e99592e51d","7887d2e0bf605dbcd0688c552cdb99d5") + ); + + executeTest(testName, spec); + } + + @Test + public void testCEUTrioMetaData() { + String testName = "testCEUTrioMetaData"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("CEUTrio.subset.vcf", "CEUTrio.metadata.txt",10), + 3, + Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","2113d2cc0a059e35b1565196b7c5d98f","7887d2e0bf605dbcd0688c552cdb99d5") + ); + + executeTest(testName, spec); + } + + @Test + public void testMalformedFam() { + String testName = "testMalformedFam"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("CEUTrio.subset.vcf", "CEUTrio.malformed.fam",10), + 3, + UserException.class + ); + + executeTest(testName, spec); + } +} + + From 55b7949d68daf5f47d46c81227349b7d7982f844 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 21 Aug 2012 09:20:55 -0400 Subject: [PATCH 16/32] Another round of FindBugs fixes. Comparator doesn't implement Serializable. --- .../sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java | 3 ++- .../sting/gatk/walkers/haplotypecaller/KBestPaths.java | 5 +++-- .../sting/gatk/walkers/annotator/HaplotypeScore.java | 3 ++- .../sting/gatk/walkers/variantrecalibration/Tranche.java | 7 ++----- .../gatk/walkers/variantrecalibration/VariantDatum.java | 3 ++- .../src/org/broadinstitute/sting/utils/MannWhitneyU.java | 5 ++++- 6 files changed, 15 insertions(+), 11 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java index 39bb3617f..287acafb3 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import org.jgrapht.graph.DefaultDirectedGraph; +import java.io.Serializable; import java.util.Comparator; /** @@ -55,7 +56,7 @@ public class DeBruijnEdge { return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge))); } - public static class EdgeWeightComparator implements Comparator { + public static class EdgeWeightComparator implements Comparator, Serializable { @Override public int compare(final DeBruijnEdge edge1, final DeBruijnEdge edge2) { return edge1.multiplicity - edge2.multiplicity; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java index 4a5a906f2..f7575439b 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java @@ -4,6 +4,7 @@ import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.jgrapht.graph.DefaultDirectedGraph; +import java.io.Serializable; import java.util.*; /** @@ -76,14 +77,14 @@ public class KBestPaths { } } - protected static class PathComparatorTotalScore implements Comparator { + protected static class PathComparatorTotalScore implements Comparator, Serializable { @Override public int compare(final Path path1, final Path path2) { return path1.totalScore - path2.totalScore; } } - //protected static class PathComparatorLowestEdge implements Comparator { + //protected static class PathComparatorLowestEdge implements Comparator, Serializable { // @Override // public int compare(final Path path1, final Path path2) { // return path2.lowestEdge - path1.lowestEdge; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index c6d8883c5..27a6c068f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -47,6 +47,7 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.Serializable; import java.util.*; /** @@ -103,7 +104,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot return map; } - private static class HaplotypeComparator implements Comparator { + private static class HaplotypeComparator implements Comparator, Serializable { public int compare(Haplotype a, Haplotype b) { if (a.getQualitySum() < b.getQualitySum()) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java index 14ef63264..042d4741d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java @@ -29,10 +29,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.text.XReadLines; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; +import java.io.*; import java.util.*; /** @@ -83,7 +80,7 @@ public class Tranche { return accessibleTruthSites > 0 ? callsAtTruthSites / (1.0*accessibleTruthSites) : 0.0; } - public static class TrancheTruthSensitivityComparator implements Comparator { + public static class TrancheTruthSensitivityComparator implements Comparator, Serializable { @Override public int compare(final Tranche tranche1, final Tranche tranche2) { return Double.compare(tranche1.ts, tranche2.ts); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java index 55052f73b..7b3b0d17d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.broadinstitute.sting.utils.GenomeLoc; +import java.io.Serializable; import java.util.Comparator; /** @@ -54,7 +55,7 @@ public class VariantDatum { public int worstAnnotation; public MultivariateGaussian assignment; // used in K-means implementation - public static class VariantDatumLODComparator implements Comparator { + public static class VariantDatumLODComparator implements Comparator, Serializable { @Override public int compare(final VariantDatum datum1, final VariantDatum datum2) { return Double.compare(datum1.lod, datum2.lod); diff --git a/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java b/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java index 99f43876c..8339e38c9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java +++ b/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; +import java.io.Serializable; import java.util.Comparator; import java.util.TreeSet; @@ -434,12 +435,14 @@ public class MannWhitneyU { * A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values * and to ensure that rank ties are broken at random. */ - private static class DitheringComparator implements Comparator> { + private static class DitheringComparator implements Comparator>, Serializable { public DitheringComparator() {} + @Override public boolean equals(Object other) { return false; } + @Override public int compare(Pair left, Pair right) { double comp = Double.compare(left.first.doubleValue(),right.first.doubleValue()); if ( comp > 0 ) { return 1; } From 605acaae9cf0b9de4f55dcd7891c7243fb555bab Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 21 Aug 2012 09:33:58 -0400 Subject: [PATCH 17/32] Another round of FindBugs fixes. Object internally stores a reference to an externally mutable array. Very dangerous. --- .../walkers/haplotypecaller/DeBruijnVertex.java | 2 +- .../walkers/bqsr/StandardRecalibrationEngine.java | 2 +- .../org/broadinstitute/sting/utils/Haplotype.java | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java index 39833613d..358bd0c38 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java @@ -14,7 +14,7 @@ public class DeBruijnVertex { public final int kmer; public DeBruijnVertex( final byte[] sequence, final int kmer ) { - this.sequence = sequence; + this.sequence = sequence.clone(); this.kmer = kmer; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index 08c7da754..aec1bf7a8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -42,7 +42,7 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP protected RecalibrationTables recalibrationTables; public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) { - this.covariates = covariates; + this.covariates = covariates.clone(); this.recalibrationTables = recalibrationTables; } diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index fcde1f419..efddf1468 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -55,22 +55,22 @@ public class Haplotype { * @param bases bases * @param qual qual */ - public Haplotype(byte[] bases, int qual) { - this.bases = bases; + public Haplotype( final byte[] bases, final int qual ) { + this.bases = bases.clone(); quals = new double[bases.length]; Arrays.fill(quals, (double)qual); } - public Haplotype(byte[] bases, double[] quals) { - this.bases = bases; - this.quals = quals; + public Haplotype( final byte[] bases, final double[] quals ) { + this.bases = bases.clone(); + this.quals = quals.clone(); } - public Haplotype(byte[] bases) { + public Haplotype( final byte[] bases ) { this(bases, 0); } - public Haplotype(byte[] bases, GenomeLoc loc) { + public Haplotype( final byte[] bases, final GenomeLoc loc ) { this(bases); this.genomeLocation = loc; } From 10961db3ce04c46ac1370f40ee9640824fc30aaa Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 21 Aug 2012 09:35:55 -0400 Subject: [PATCH 18/32] Another round of FindBugs fixes. Object returns its internal reference to an externally mutable array. Very dangerous. --- .../sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java | 2 +- public/java/src/org/broadinstitute/sting/utils/Haplotype.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java index 358bd0c38..4da3251bc 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java @@ -37,7 +37,7 @@ public class DeBruijnVertex { } public byte[] getSequence() { - return sequence; + return sequence.clone(); } public byte[] getSuffix() { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index efddf1468..befd24307 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -140,10 +140,10 @@ public class Haplotype { } public double[] getQuals() { - return quals; + return quals.clone(); } public byte[] getBases() { - return bases; + return bases.clone(); } public long getStartPosition() { From 6a8cf1c84a131a2fb8d9a6c9e0b5b9ef16799a67 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 21 Aug 2012 14:35:40 -0400 Subject: [PATCH 19/32] Enable and adapt HaplotypeScore and MappingQualityZero as active region annotations now that we have per-read likelihoods passed in to annotations --- .../haplotypecaller/HaplotypeCaller.java | 2 +- .../annotator/DepthPerAlleleBySample.java | 4 +- .../walkers/annotator/HaplotypeScore.java | 52 ++++++++++++++----- .../walkers/annotator/MappingQualityZero.java | 35 ++++++++++++- .../annotator/VariantAnnotatorEngine.java | 8 ++- .../ActiveRegionBasedAnnotation.java | 2 +- 6 files changed, 83 insertions(+), 20 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 01c3f0491..39c7551f0 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -178,7 +178,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem * so annotations will be excluded even if they are explicitly included with the other options. */ @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) - protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"HaplotypeScore", "MappingQualityZero", "SpanningDeletions", "TandemRepeatAnnotator"})); + protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"})); /** * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 055656fe2..b3fc67c2f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -58,7 +58,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa return; if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) - annotateWithLikelihoods(alleleLikelihoodMap, ref.getBase(), vc, gb); + annotateWithLikelihoods(alleleLikelihoodMap, vc, gb); else if ( vc.isSNP() && stratifiedContext != null) annotateWithPileup(stratifiedContext, vc, gb); } @@ -84,7 +84,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa gb.AD(counts); } - private void annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final byte refBase, final VariantContext vc, final GenotypeBuilder gb) { + private void annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc, final GenotypeBuilder gb) { final HashMap alleleCounts = new HashMap(); for ( final Allele allele : vc.getAlleles() ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index eedb8cbd7..2bf060f12 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -28,6 +28,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -56,7 +57,7 @@ import java.util.*; * are indicative of regions with bad alignments, often leading to artifactual SNP and indel calls. * Note that the Haplotype Score is only calculated for sites with read coverage. */ -public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation { +public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private final static boolean DEBUG = false; private final static int MIN_CONTEXT_WING_SIZE = 10; private final static int MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER = 50; @@ -68,10 +69,19 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final Map stratifiedContexts, final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { - if (stratifiedContexts.size() == 0) // size 0 means that call was made by someone else and we have no data here + if (vc.isSNP() && stratifiedContexts != null) + return annotatePileup(ref, stratifiedContexts, vc); + else if (stratifiedPerReadAlleleLikelihoodMap != null) + return annotateWithLikelihoods(stratifiedPerReadAlleleLikelihoodMap, vc); + else return null; + } - if (!vc.isSNP() && !vc.isIndel() && !vc.isMixed()) + private Map annotatePileup(final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc) { + + if (stratifiedContexts.size() == 0) // size 0 means that call was made by someone else and we have no data here return null; final AlignmentContext context = AlignmentContextUtils.joinContexts(stratifiedContexts.values()); @@ -92,16 +102,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName()); if (thisContext != null) { final ReadBackedPileup thisPileup = thisContext.getBasePileup(); - if (vc.isSNP()) - scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense - else if (vc.isIndel() || vc.isMixed()) { - if (stratifiedPerReadAlleleLikelihoodMap == null) - return null; - Double d = scoreIndelsAgainstHaplotypes(stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName())); - if (d == null) - return null; - scoreRA.add(d); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense - } + scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense } } } @@ -112,6 +113,31 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot return map; } + private Map annotateWithLikelihoods(final Map stratifiedPerReadAlleleLikelihoodMap, + final VariantContext vc) { + + final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage(); + for (final Genotype genotype : vc.getGenotypes()) { + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); + if (perReadAlleleLikelihoodMap == null) + continue; + + Double d = scoreIndelsAgainstHaplotypes(perReadAlleleLikelihoodMap); + if (d == null) + continue; + scoreRA.add(d); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense + } + + // if (scoreRA.observationCount() == 0) + // return null; + + // annotate the score in the info field + final Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.4f", scoreRA.mean())); + return map; + + } + private static class HaplotypeComparator implements Comparator, Serializable { public int compare(Haplotype a, Haplotype b) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java index c3cb01c23..f8abd59e3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; @@ -12,6 +13,8 @@ import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -23,7 +26,7 @@ import java.util.Map; /** * Total count across all samples of mapping quality zero reads */ -public class MappingQualityZero extends InfoFieldAnnotation implements StandardAnnotation { +public class MappingQualityZero extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, @@ -31,6 +34,17 @@ public class MappingQualityZero extends InfoFieldAnnotation implements StandardA final Map stratifiedContexts, final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { + if (vc.isSNP() && stratifiedContexts != null) + return annotatePileup(ref, stratifiedContexts, vc); + else if (stratifiedPerReadAlleleLikelihoodMap != null) + return annotateWithLikelihoods(stratifiedPerReadAlleleLikelihoodMap, vc); + else + return null; + } + + private Map annotatePileup(final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; @@ -48,6 +62,25 @@ public class MappingQualityZero extends InfoFieldAnnotation implements StandardA return map; } + private Map annotateWithLikelihoods(final Map stratifiedPerReadAlleleLikelihoodMap, + final VariantContext vc) { + if (stratifiedPerReadAlleleLikelihoodMap == null) + return null; + + int mq0 = 0; + for ( PerReadAlleleLikelihoodMap likelihoodMap : stratifiedPerReadAlleleLikelihoodMap.values() ) { + for (GATKSAMRecord read : likelihoodMap.getLikelihoodReadMap().keySet()) { + + if (read.getMappingQuality() == 0 ) + mq0++; + } + } + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%d", mq0)); + return map; + } + + public List getKeyNames() { return Arrays.asList(VCFConstants.MAPPING_QUALITY_ZERO_KEY); } public List getDescriptions() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 02e6a8508..a1bd8dcbd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -291,8 +291,12 @@ public class VariantAnnotatorEngine { final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); for ( final Genotype genotype : vc.getGenotypes() ) { - final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); + AlignmentContext context = null; + PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = null; + if (stratifiedContexts != null) + context = stratifiedContexts.get(genotype.getSampleName()); + if (stratifiedPerReadAlleleLikelihoodMap != null) + perReadAlleleLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); if ( context == null && perReadAlleleLikelihoodMap == null) { // no likelihoods nor pileup available: just move on to next sample diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java index 7af4baddb..9c5710872 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java @@ -11,7 +11,7 @@ import java.util.Map; // TODO -- make this an abstract class when we move away from InfoFieldAnnotation public interface ActiveRegionBasedAnnotation extends AnnotationType { - // return annotations for the given contexts split by sample and then read likelihoof + // return annotations for the given contexts split by sample and then read likelihood public abstract Map annotate(final Map stratifiedContexts, final VariantContext vc); // return the descriptions used for the VCF INFO meta field From 9eec33ec3b1dd640456b1cc5584f008a8b5a9b7e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 20 Aug 2012 10:33:42 -0400 Subject: [PATCH 20/32] Complete GSA-497: Let Queue write out runInfo on the fly, after each job group finishes running -- Queue will incrementally now write out its jobReport.txt file whenever jobs finish running (FAIL or DONE) -- This makes it far easier to track what's going on, or to analyze incrementally performance results coming out of Queue -- Generally cleaned up the QJobsReporting code, creating a new clean class QJobsReporter that holds all of the information on what to do log and where to put into, which was previously scattered in QCommandLine and QJobReport --- .../sting/queue/QCommandLine.scala | 26 +--- .../sting/queue/engine/QGraph.scala | 64 ++++++++- .../sting/queue/util/QJobReport.scala | 69 +--------- .../sting/queue/util/QJobsReporter.scala | 121 ++++++++++++++++++ 4 files changed, 189 insertions(+), 91 deletions(-) create mode 100644 public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 041e84a8c..775847ba9 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -105,8 +105,7 @@ class QCommandLine extends CommandLineProgram with Logging { def execute = { if (settings.qSettings.runName == null) settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) - - qGraph.settings = settings + qGraph.initializeWithSettings(settings) val allQScripts = pluginManager.createAllTypes(); for (script <- allQScripts) { @@ -137,26 +136,9 @@ class QCommandLine extends CommandLineProgram with Logging { logger.info("Script %s with %d total jobs".format(if (success) "completed successfully" else "failed", functionsAndStatus.size)) - if (!settings.disableJobReport) { - val jobStringName = { - if (settings.jobReportFile != null) - settings.jobReportFile - else - settings.qSettings.runName + ".jobreport.txt" - } - - if (!shuttingDown) { - val reportFile = IOUtils.absolute(settings.qSettings.runDirectory, jobStringName) - logger.info("Writing JobLogging GATKReport to file " + reportFile) - QJobReport.printReport(functionsAndStatus, reportFile) - - if (settings.run) { - val pdfFile = IOUtils.absolute(settings.qSettings.runDirectory, FilenameUtils.removeExtension(jobStringName) + ".pdf") - logger.info("Plotting JobLogging GATKReport to file " + pdfFile) - QJobReport.plotReport(reportFile, pdfFile) - } - } - } + // write the final complete job report + logger.info("Writing final jobs report...") + qGraph.writeJobsReport() if (!qGraph.success) { logger.info("Done with errors") diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala index e3a1714ff..2c33596e1 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala @@ -39,7 +39,7 @@ import collection.immutable.{TreeSet, TreeMap} import org.broadinstitute.sting.queue.function.scattergather.{ScatterFunction, CloneFunction, GatherFunction, ScatterGatherableFunction} import java.util.Date import org.broadinstitute.sting.utils.Utils -import org.apache.commons.io.{FileUtils, IOUtils} +import org.apache.commons.io.{FilenameUtils, FileUtils, IOUtils} import java.io.{OutputStreamWriter, File} /** @@ -71,6 +71,16 @@ class QGraph extends Logging { private val inProcessManager = new InProcessJobManager private def managers = Seq[Any](inProcessManager, commandLineManager) + /** + * If true, we will write out incremental job reports + */ + private val INCREMENTAL_JOBS_REPORT = true + + /** + * Holds the optional jobInfoReporter structure + */ + private var jobInfoReporter: QJobsReporter = null + private class StatusCounts { var pending = 0 var running = 0 @@ -79,6 +89,19 @@ class QGraph extends Logging { } private val statusCounts = new StatusCounts + /** + * Final initialization step of this QGraph -- tell it runtime setting information + * + * The settings aren't necessarily available until after this QGraph object has been constructed, so + * this function must be called once the QGraphSettings have been filled in. + * + * @param settings + */ + def initializeWithSettings(settings: QGraphSettings) { + this.settings = settings + this.jobInfoReporter = createJobsReporter() + } + /** * Adds a QScript created CommandLineFunction to the graph. * @param command Function to add to the graph. @@ -467,6 +490,12 @@ class QGraph extends Logging { checkRetryJobs(failedJobs) } + // incremental + if ( logNextStatusCounts && INCREMENTAL_JOBS_REPORT ) { + logger.info("Writing incremental jobs reports...") + writeJobsReport(false) + } + readyJobs ++= getReadyJobs } @@ -1084,6 +1113,39 @@ class QGraph extends Logging { } } + /** + * Create the jobsReporter for this QGraph, based on the settings data. + * + * Must be called after settings has been initialized properly + * + * @return + */ + private def createJobsReporter(): QJobsReporter = { + val jobStringName = if (settings.jobReportFile != null) + settings.jobReportFile + else + settings.qSettings.runName + ".jobreport.txt" + + val reportFile = org.broadinstitute.sting.utils.io.IOUtils.absolute(settings.qSettings.runDirectory, jobStringName) + + val pdfFile = if ( settings.run ) + Some(org.broadinstitute.sting.utils.io.IOUtils.absolute(settings.qSettings.runDirectory, FilenameUtils.removeExtension(jobStringName) + ".pdf")) + else + None + + new QJobsReporter(settings.disableJobReport, reportFile, pdfFile) + } + + /** + * Write, if possible, the jobs report + */ + def writeJobsReport(plot: Boolean = true) { + // note: the previous logic didn't write the job report if the system was shutting down, but I don't + // see any reason not to write the job report + if ( jobInfoReporter != null ) + jobInfoReporter.write(this, plot) + } + /** * Returns true if the graph was shutdown instead of exiting on its own. */ diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala index c69a310b3..0600f9ad5 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala @@ -25,13 +25,8 @@ package org.broadinstitute.sting.queue.util import org.broadinstitute.sting.queue.function.QFunction -import org.broadinstitute.sting.gatk.report.{GATKReportTable, GATKReport} -import org.broadinstitute.sting.utils.exceptions.UserException +import org.broadinstitute.sting.gatk.report.GATKReportTable import org.broadinstitute.sting.queue.engine.JobRunInfo -import java.io.{PrintStream, File} -import org.broadinstitute.sting.utils.R.{RScriptLibrary, RScriptExecutor} -import org.broadinstitute.sting.utils.io.Resource -import org.apache.commons.io.{IOUtils, FileUtils} /** * A mixin to add Job info to the class @@ -98,31 +93,10 @@ trait QJobReport extends Logging { } object QJobReport { - val JOB_REPORT_QUEUE_SCRIPT = "queueJobReport.R" - // todo -- fixme to have a unique name for Scatter/gather jobs as well var seenCounter = 1 var seenNames = Set[String]() - def printReport(jobsRaw: Map[QFunction, JobRunInfo], dest: File) { - val jobs = jobsRaw.filter(_._2.isFilledIn).filter(_._1.includeInReport) - jobs foreach {case (qf, info) => qf.setRunInfo(info)} - val stream = new PrintStream(FileUtils.openOutputStream(dest)) - try { - printJobLogging(jobs.keys.toSeq, stream) - } finally { - IOUtils.closeQuietly(stream) - } - } - - def plotReport(reportFile: File, pdfFile: File) { - val executor = new RScriptExecutor - executor.addLibrary(RScriptLibrary.GSALIB) - executor.addScript(new Resource(JOB_REPORT_QUEUE_SCRIPT, classOf[QJobReport])) - executor.addArgs(reportFile.getAbsolutePath, pdfFile.getAbsolutePath) - executor.exec() - } - def workAroundSameJobNames(func: QFunction):String = { if ( seenNames.apply(func.jobName) ) { seenCounter += 1 @@ -132,45 +106,4 @@ object QJobReport { func.jobName } } - - /** - * Prints the JobLogging logs to a GATKReport. First splits up the - * logs by group, and for each group generates a GATKReportTable - */ - private def printJobLogging(logs: Seq[QFunction], stream: PrintStream) { - // create the report - val report: GATKReport = new GATKReport - - // create a table for each group of logs - for ( (group, groupLogs) <- groupLogs(logs) ) { - val keys = logKeys(groupLogs) - report.addTable(group, "Job logs for " + group, keys.size) - val table: GATKReportTable = report.getTable(group) - - // add the columns - keys.foreach(table.addColumn(_)) - for (log <- groupLogs) { - for ( key <- keys ) - table.set(log.getReportName, key, log.getReportFeature(key)) - } - } - - report.print(stream) - } - - private def groupLogs(logs: Seq[QFunction]): Map[String, Seq[QFunction]] = { - logs.groupBy(_.getReportGroup) - } - - private def logKeys(logs: Seq[QFunction]): Set[String] = { - // the keys should be the same for each log, but we will check that - val keys = Set[String](logs(0).getReportFeatureNames : _*) - - for ( log <- logs ) - if ( keys.sameElements(Set(log.getReportFeatureNames)) ) - throw new UserException(("All JobLogging jobs in the same group must have the same set of features. " + - "We found one with %s and another with %s").format(keys, log.getReportFeatureNames)) - - keys - } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala new file mode 100644 index 000000000..a23fe4485 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.util + +import java.io.{PrintStream, File} +import org.broadinstitute.sting.utils.io.{Resource} +import org.broadinstitute.sting.queue.engine.{JobRunInfo, QGraph} +import org.broadinstitute.sting.queue.function.QFunction +import org.broadinstitute.sting.utils.R.{RScriptLibrary, RScriptExecutor} +import org.broadinstitute.sting.gatk.report.{GATKReportTable, GATKReport} +import org.broadinstitute.sting.utils.exceptions.UserException +import org.apache.commons.io.{FileUtils, IOUtils} + +/** + * Writes out RunInfo to a GATKReport + */ +class QJobsReporter(val disabled: Boolean, val reportFile: File, val pdfFile: Option[File]) extends Logging { + private val JOB_REPORT_QUEUE_SCRIPT = "queueJobReport.R" + + /** + * Write out a job report based on the finished jobs graph + * @param jobGraph + * @param enabledPlotting if true, we will plot the report as well with the JOB_REPORT_QUEUE_SCRIPT + */ + def write(jobGraph: QGraph, enabledPlotting: Boolean) { + if ( ! disabled ) { + logger.info("Writing JobLogging GATKReport to file " + reportFile) + printReport(jobGraph.getFunctionsAndStatus, reportFile) + + if ( enabledPlotting ) + pdfFile match { + case Some(file) => + logger.info("Plotting JobLogging GATKReport to file " + file) + plotReport(reportFile, file) + case None => + } + } + } + + private def printReport(jobsRaw: Map[QFunction, JobRunInfo], dest: File) { + val jobs = jobsRaw.filter(_._2.isFilledIn).filter(_._1.includeInReport) + jobs foreach {case (qf, info) => qf.setRunInfo(info)} + val stream = new PrintStream(FileUtils.openOutputStream(dest)) + try { + printJobLogging(jobs.keys.toSeq, stream) + } finally { + IOUtils.closeQuietly(stream) + } + } + + private def plotReport(reportFile: File, pdfFile: File) { + val executor = new RScriptExecutor + executor.addLibrary(RScriptLibrary.GSALIB) + executor.addScript(new Resource(JOB_REPORT_QUEUE_SCRIPT, classOf[QJobReport])) + executor.addArgs(reportFile.getAbsolutePath, pdfFile.getAbsolutePath) + executor.exec() + } + + /** + * Prints the JobLogging logs to a GATKReport. First splits up the + * logs by group, and for each group generates a GATKReportTable + */ + private def printJobLogging(logs: Seq[QFunction], stream: PrintStream) { + // create the report + val report: GATKReport = new GATKReport + + // create a table for each group of logs + for ( (group, groupLogs) <- groupLogs(logs) ) { + val keys = logKeys(groupLogs) + report.addTable(group, "Job logs for " + group, keys.size) + val table: GATKReportTable = report.getTable(group) + + // add the columns + keys.foreach(table.addColumn(_)) + for (log <- groupLogs) { + for ( key <- keys ) + table.set(log.getReportName, key, log.getReportFeature(key)) + } + } + + report.print(stream) + } + + private def groupLogs(logs: Seq[QFunction]): Map[String, Seq[QFunction]] = { + logs.groupBy(_.getReportGroup) + } + + private def logKeys(logs: Seq[QFunction]): Set[String] = { + // the keys should be the same for each log, but we will check that + val keys = Set[String](logs(0).getReportFeatureNames : _*) + + for ( log <- logs ) + if ( keys.sameElements(Set(log.getReportFeatureNames)) ) + throw new UserException(("All JobLogging jobs in the same group must have the same set of features. " + + "We found one with %s and another with %s").format(keys, log.getReportFeatureNames)) + + keys + } +} From 6ce8016ae7b60d79f6a11a4dcc0835e392a5a600 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 20 Aug 2012 13:19:35 -0400 Subject: [PATCH 21/32] GSA-491: Add hidden tag to GATK that propagates to the GATK logs --- .../arguments/GATKArgumentCollection.java | 23 +++++++++++++++++++ .../sting/gatk/phonehome/GATKRunReport.java | 5 ++++ 2 files changed, 28 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index bbbd96cf1..f66e229bc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -64,12 +64,35 @@ public class GATKArgumentCollection { @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false) public Integer readBufferSize = null; + // -------------------------------------------------------------------------------------------------------------- + // + // GATKRunReport options + // + // -------------------------------------------------------------------------------------------------------------- + @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false) public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD; @Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false) public File gatkKeyFile = null; + /** + * The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary String tag that can be + * used to group together runs during later analysis. One use of this capability is to tag runs as GATK + * performance tests, so that the performance of the GATK over time can be assessed from the logs directly. + * + * Note that the tags do not conform to any ontology, so you are free to use any tags that you might find + * meaningful. + */ + @Argument(fullName = "tag", shortName = "tag", doc="Arbitrary tag string to identify this GATK run as part of a group of runs, for later analysis", required = false) + public String tag = "NA"; + + // -------------------------------------------------------------------------------------------------------------- + // + // XXX + // + // -------------------------------------------------------------------------------------------------------------- + @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false) public List readFilters = new ArrayList(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index b60a7845a..035252c14 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -138,6 +138,9 @@ public class GATKRunReport { @Element(required = true, name = "iterations") private long nIterations; + @Element(required = true, name = "tag") + private String tag; + public enum PhoneHomeOption { /** Disable phone home */ NO_ET, @@ -186,6 +189,8 @@ public class GATKRunReport { nIterations = engine.getCumulativeMetrics().getNumIterations(); } + tag = engine.getArguments().tag; + // user and hostname -- information about the runner of the GATK userName = System.getProperty("user.name"); hostName = Utils.resolveHostname(); From 03017855e40e4c301d99a76573691a55232bc15c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 22 Aug 2012 00:24:01 -0400 Subject: [PATCH 25/32] WTF - why is support for whole-read insertions all messed up in LIBS? I've pushed a temporary patch for now (the right solution should certainly not be implemented in stable; LIBS needs to be better thought out). Added another unit test. --- .../gatk/iterators/LocusIteratorByState.java | 5 ++- .../LocusIteratorByStateUnitTest.java | 35 ++++++++++++++----- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index b5d5dfc0e..75af7976f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -304,7 +304,7 @@ public class LocusIteratorByState extends LocusIterator { final boolean isSingleElementCigar = nextElement == lastElement; final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator - final int readOffset = state.getReadOffset(); // the base offset on this read + int readOffset = state.getReadOffset(); // the base offset on this read final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION; final boolean isAfterDeletion = lastOp == CigarOperator.DELETION; @@ -331,6 +331,9 @@ public class LocusIteratorByState extends LocusIterator { String insertedBaseString = null; if (nextOp == CigarOperator.I) { final int insertionOffset = isSingleElementCigar ? 0 : 1; + // TODO -- someone please implement a better fix for the single element insertion CIGAR! + if (isSingleElementCigar) + readOffset -= (nextElement.getLength() - 1); // LIBS has passed over the insertion bases! insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength())); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index e65a7fb27..edd97f17f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -208,19 +208,18 @@ public class LocusIteratorByStateUnitTest extends BaseTest { } /** - * Test to make sure that reads supporting only an indel (example cigar string: 76I) do - * not negatively influence the ordering of the pileup. + * Test to make sure that reads supporting only an indel (example cigar string: 76I) are represented properly */ @Test - public void testWholeIndelReadTest2() { + public void testWholeIndelReadRepresentedTest() { final int firstLocus = 44367788, secondLocus = firstLocus + 1; - SAMRecord read = ArtificialSAMUtils.createArtificialRead(header,"read",0,secondLocus,1); - read.setReadBases(Utils.dupBytes((byte) 'A', 1)); - read.setBaseQualities(Utils.dupBytes((byte) '@', 1)); - read.setCigarString("1I"); + SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,secondLocus,1); + read1.setReadBases(Utils.dupBytes((byte) 'A', 1)); + read1.setBaseQualities(Utils.dupBytes((byte) '@', 1)); + read1.setCigarString("1I"); - List reads = Arrays.asList(read); + List reads = Arrays.asList(read1); // create the iterator by state with the fake reads and fake records li = makeLTBS(reads, createTestReadProperties()); @@ -234,6 +233,26 @@ public class LocusIteratorByStateUnitTest extends BaseTest { Assert.assertFalse(pe.isAfterInsertion()); Assert.assertEquals(pe.getEventBases(), "A"); } + + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,secondLocus,10); + read2.setReadBases(Utils.dupBytes((byte) 'A', 10)); + read2.setBaseQualities(Utils.dupBytes((byte) '@', 10)); + read2.setCigarString("10I"); + + reads = Arrays.asList(read2); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "AAAAAAAAAA"); + } } private static ReadProperties createTestReadProperties() { From 20601f034e97456ba5f2c5d30b4ca0eca1348d76 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Wed, 22 Aug 2012 07:33:10 -0700 Subject: [PATCH 26/32] Updating the checkType() function to include the new StructuralIndel variant type. Fixes outstanding broken integration test. --- .../gatk/walkers/variantrecalibration/VariantDataManager.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index aacd987d5..33a543e39 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -286,6 +286,7 @@ public class VariantDataManager { case INDEL: case MIXED: case SYMBOLIC: + case STRUCTURAL_INDEL: return checkVariationClass( evalVC, VariantRecalibratorArgumentCollection.Mode.INDEL ); default: return false; From 9e76e8aa0bb75a12f69983a8c4790276a3bff4c5 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 22 Aug 2012 11:26:08 -0400 Subject: [PATCH 27/32] Just noticed that the efficient conversion to uppercase method is redundant since it's already implemented efficiently in Picard; let's just have a single implementation. --- .../broadinstitute/sting/utils/BaseUtils.java | 26 +++---------------- .../sting/utils/variantcontext/Allele.java | 2 +- 2 files changed, 4 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 2d7f51c3f..8c95091a6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.utils; +import net.sf.samtools.util.StringUtil; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import java.util.Arrays; @@ -444,29 +445,8 @@ public class BaseUtils { * @param bases the bases * @return the upper cased version */ - static public byte[] convertToUpperCase(final byte[] bases) { - for ( int i = 0; i < bases.length; i++ ) { - if ( (char)bases[i] >= 'a' ) - bases[i] = toUpperCaseBase(bases[i]); - } - return bases; - } - - static public byte toUpperCaseBase(final byte base) { - switch (base) { - case 'a': - return 'A'; - case 'c': - return 'C'; - case 'g': - return 'G'; - case 't': - return 'T'; - case 'n': - return 'N'; - default: - return base; - } + static public void convertToUpperCase(final byte[] bases) { + StringUtil.toUpperCase(bases); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java index 2c312678e..85c925204 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java @@ -105,7 +105,7 @@ public class Allele implements Comparable { if ( isRef ) throw new IllegalArgumentException("Cannot tag a symbolic allele as the reference allele"); } else { - bases = BaseUtils.convertToUpperCase(bases); + BaseUtils.convertToUpperCase(bases); } this.isRef = isRef; From 901f47d8af97c0d61e6f06727a6242879dd1dfad Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 22 Aug 2012 11:38:51 -0400 Subject: [PATCH 28/32] Final step (for now) in VA refactoring: update MD5's because, a) since it's not guaranteed that we'll iterate through reads/pileups in the same order, the rank sum dithering will change annotations, b) FS uses new generic threshold to distinguish uninformative reads (it used to use ad-hoc thresholds), c) AD definition changed and throws away uninformative reads, d) shortened general ploidy integration tests for quicker debugging. May have missed some MD5's in the update so there may be lingering test failures still --- ...GenotyperGeneralPloidyIntegrationTest.java | 16 ++--- .../HaplotypeCallerIntegrationTest.java | 8 +-- .../annotator/DepthPerAlleleBySample.java | 2 +- .../walkers/annotator/HaplotypeScore.java | 2 +- .../walkers/annotator/MappingQualityZero.java | 4 +- .../gatk/walkers/annotator/RankSumTest.java | 3 + .../VariantAnnotatorIntegrationTest.java | 10 +-- .../UnifiedGenotyperIntegrationTest.java | 62 +++++++++---------- 8 files changed, 55 insertions(+), 52 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 6ae34f190..b5b0abc6e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -18,8 +18,8 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam"; final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf"; final String REFSAMPLE_NAME = "NA12878"; - final String MTINTERVALS = "MT:1-3000"; - final String LSVINTERVALS = "20:40,000,000-41,000,000"; + final String MTINTERVALS = "MT:1-1000"; + final String LSVINTERVALS = "20:40,500,000-41,000,000"; final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf"; final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf"; final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf"; @@ -47,31 +47,31 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","0934f72865388999efec64bd9d4a9b93"); + PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","077db83cf7dc5490f670c85856b408b2"); } @Test(enabled = true) public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","126581c72d287722437274d41b6fed7b"); + PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","e460a17377b731ff4eab36fb56042ecd"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","b543aa1c3efedb301e525c1d6c50ed8d"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","9514ed15c7030b6d47e04e6a3a2b0a3e"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","55b20557a836bb92688e68f12d7f5dc4"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","26598044436c8044f22ffa767b06a0f0"); } @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","7eb889e8e07182f4c3d64609591f9459"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","da359fe7dd6dce045193198c264301ee"); } @Test(enabled = true) public void testMT_SNP_GGA_sp10() { - PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "db8114877b99b14f7180fdcd24b040a7"); + PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "ad0eef3a9deaa098d79df62af7e5448a"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index c766f363c..fc9d56660 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -20,17 +20,17 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "6b30c7e1b6bbe80d180d9d67441cec12"); + HCTest(CEUTRIO_BAM, "", "e5b4a0627a1d69b9356f8a7cd2260e89"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "4cdfbfeadef00725974828310558d7d4"); + HCTest(NA12878_BAM, "", "202d5b6edaf74f411c170099749f202f"); } @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "6183fb6e374976d7087150009685e043"); + HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "561931ba3919808ec471e745cb3148c7"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -41,7 +41,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(CEUTRIO_BAM, "", "ab7593a7a60a2e9a66053572f1718df1"); + HCTestComplexVariants(CEUTRIO_BAM, "", "316a0fb9c66c0a6aa40a170d5d8c0021"); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index b3fc67c2f..320fe3148 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -59,7 +59,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) annotateWithLikelihoods(alleleLikelihoodMap, vc, gb); - else if ( vc.isSNP() && stratifiedContext != null) + else if ( stratifiedContext != null && (vc.isSNP())) annotateWithPileup(stratifiedContext, vc, gb); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index 2bf060f12..6487eac50 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -71,7 +71,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final Map stratifiedPerReadAlleleLikelihoodMap) { if (vc.isSNP() && stratifiedContexts != null) return annotatePileup(ref, stratifiedContexts, vc); - else if (stratifiedPerReadAlleleLikelihoodMap != null) + else if (stratifiedPerReadAlleleLikelihoodMap != null && vc.isVariant()) return annotateWithLikelihoods(stratifiedPerReadAlleleLikelihoodMap, vc); else return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java index f8abd59e3..74f9c0d0c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java @@ -34,9 +34,9 @@ public class MappingQualityZero extends InfoFieldAnnotation implements StandardA final Map stratifiedContexts, final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { - if (vc.isSNP() && stratifiedContexts != null) + if ((vc.isSNP() || !vc.isVariant()) && stratifiedContexts != null) return annotatePileup(ref, stratifiedContexts, vc); - else if (stratifiedPerReadAlleleLikelihoodMap != null) + else if (stratifiedPerReadAlleleLikelihoodMap != null && vc.isVariant()) return annotateWithLikelihoods(stratifiedPerReadAlleleLikelihoodMap, vc); else return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index fb9f8603e..ec873c5dd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -67,6 +67,9 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR fillQualsFromPileup(vc.getAlleles(), vc.getStart(), pileup, indelLikelihoodMap, refQuals, altQuals ); } + if (refQuals.isEmpty() && altQuals.isEmpty()) + return null; + final MannWhitneyU mannWhitneyU = new MannWhitneyU(); for (final Double qual : altQuals) { mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 17d27c156..aa4fd7a75 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("95b0627bfcac2191aed9908904e892ff")); + Arrays.asList("4a0318d0452d2dccde48ef081c431bf8")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("0e2509349fd6c8a9e9408c918215e1de")); + Arrays.asList("da19c8e3c58340ba8bcc88e95ece4ac1")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("32d81a7797605afb526983a2ab45efc2")); + Arrays.asList("cdefe79f46482a3d050ca2132604663a")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("350539ccecea0d1f7fffd4ac29c015e7")); + Arrays.asList("5ec4c07b6801fca7013e3b0beb8b5418")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -90,7 +90,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("c222361819fae035a0162f876990fdee")); + Arrays.asList("28c07151f5c5fae87c691d8f7d1a3929")); executeTest("test overwriting header", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 7b6e1ee96..7390ec206 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("0039fd0464c87e6ce66c4c8670fd8dfa")); + Arrays.asList("9a7fa3e9ec8350e3e9cfdce0c00ddcc3")); executeTest("test MultiSample Pilot1", spec); } @@ -36,7 +36,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("d1e68d4db6585ec00213b1d2d05e01a9")); + Arrays.asList("78693f3bf5d588e250507a596aa400da")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); } @@ -44,7 +44,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("b53860d209f8440f12b78d01606553e1")); + Arrays.asList("babf24ec8e5b5708d4a049629f7ea073")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -52,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("61007c22c00a2871237280914a8f88f0")); + Arrays.asList("754187e70c1d117087e2270950a1c230")); executeTest("test SingleSample Pilot2", spec); } @@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("feda4a38bba096f7b740a146055509c2")); + Arrays.asList("f9a2f882d050a90e6d8e6a1fba00f858")); executeTest("test Multiple SNP alleles", spec); } @@ -76,7 +76,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("0ff525e65c5836289c454c76ead5d80e")); + Arrays.asList("8a4ad38ec8015eea3461295148143428")); executeTest("test reverse trim", spec); } @@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "e1a17f8f852c3d639f26e659d37bc1e5"; + private final static String COMPRESSED_OUTPUT_MD5 = "ebb42960e115fb8dacd3edff5541b4da"; @Test public void testCompressedOutput() { @@ -139,7 +139,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("b0b92abbaaa4c787dce6f1b302f983ee")); + Arrays.asList("91f7e112200ed2c3b0a5d0d9e16e9369")); executeTest("test min_base_quality_score 26", spec); } @@ -147,7 +147,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("186d33429756c89aad6cd89424d6dc94")); + Arrays.asList("b86e52b18496ab43a6b9a1bda632b5e6")); executeTest("test SLOD", spec); } @@ -155,7 +155,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNDA() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("11b87f68b8530da168c1418513115f30")); + Arrays.asList("79b3e4f8b4476ce3c3acbc271d6ddcdc")); executeTest("test NDA", spec); } @@ -163,23 +163,23 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testCompTrack() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("d2be4b1af1f29579c4f96c08e1ddd871")); + Arrays.asList("bf7f21a600956eda0a357b97b21e3069")); executeTest("test using comp track", spec); } @Test public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "0055bd060e6ef53a6b836903d68953c9"); + testOutputParameters("-sites_only", "976109543d8d97d94e0fe0521ff326e8"); } @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "235bec0a7b2d901442261104db18f5eb"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "8084a847f4a3c53a030e8c52eec35cea"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "7c57ede7019063c19aa9d2136045d84f"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "931e396f2a6903a291e813c64c18f8b5"); } private void testOutputParameters(final String args, final String md5) { @@ -193,7 +193,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("3f8d724a5158adac4df38c4e2ed04167")); + Arrays.asList("e94be02fc5484c20b512840884e3d463")); executeTest("test confidence 1", spec1); } @@ -201,7 +201,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("3f8d724a5158adac4df38c4e2ed04167")); + Arrays.asList("e94be02fc5484c20b512840884e3d463")); executeTest("test confidence 2", spec2); } @@ -212,12 +212,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "7e7384a3a52e19f76f368c2f4561d510" ); + testHeterozosity( 0.01, "0dca2699f709793026b853c6f339bf08" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "3d16366d870c086e894c07c9da411795" ); + testHeterozosity( 1.0 / 1850, "35f14e436927e64712a8e28080e90c91" ); } private void testHeterozosity(final double arg, final String md5) { @@ -241,7 +241,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("58abc4f504d3afd42271e290ac846c4b")); + Arrays.asList("0360b79163aa28ae66d0dde4c26b3d76")); executeTest(String.format("test multiple technologies"), spec); } @@ -260,7 +260,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("e247f579f01eb698cfa1ae1e8a3995a8")); + Arrays.asList("59892388916bdfa544750ab76e43eabb")); executeTest(String.format("test calling with BAQ"), spec); } @@ -279,7 +279,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("cc2167dce156f70f5a31ac3dce499266")); + Arrays.asList("6aa034f669ec09ac4f5a28624cbe1830")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -294,7 +294,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("1268bde77842e6bb6a4f337c1d589f4d")); + Arrays.asList("ba7a011d0c665acc4455d58a6ab28716")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -307,7 +307,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("10c86ff98ad5ab800d208b435bcfbd7d")); + Arrays.asList("4f7d80f4f53ef0f0959414cb30097482")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -317,7 +317,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("c0c4dbb050296633a3150b104b77e05a")); + Arrays.asList("95986d0c92436d3b9c1f1be9c768a368")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -327,7 +327,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("2472722f87f8718861698f60bbba2462")); + Arrays.asList("cecd3e35a817e299e97e8f7bbf083d2c")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -335,13 +335,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("eeb64b261f0a44aa478d753dbbf9378e")); + Arrays.asList("c3f786a5228346b43a80aa80d22b1490")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("d0a66c234056bb83dd84113bc2421f1e")); + Arrays.asList("1a4d856bfe53d9acee0ea303c4b83bb1")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -351,7 +351,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + privateTestDir + vcf + " -I " + validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1, - Arrays.asList("db0f91abb901e097714d8755058e1319")); + Arrays.asList("d76eacc4021b78ccc0a9026162e814a7")); executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec); } @@ -363,7 +363,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 20:10,000,000-10,100,000", 1, - Arrays.asList("b3c923ed9efa04b85fc18a9b45c8d2a6")); + Arrays.asList("59ff26d7e5ca2503ebe9f74902251551")); executeTest(String.format("test UG with base indel quality scores"), spec); } @@ -397,7 +397,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("160600dfa8e46f91dbb5d574517aac74")); + Arrays.asList("f99f9a917529bfef717fad97f725d5df")); executeTest("test minIndelFraction 0.0", spec); } @@ -405,7 +405,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("aa58dc9f77132c30363562bcdc321f6e")); + Arrays.asList("eac2cd649bd5836068350eb4260aaea7")); executeTest("test minIndelFraction 0.25", spec); } From 94540ccc278ffe7374abcebc2e1227c859b3159f Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 22 Aug 2012 12:54:29 -0400 Subject: [PATCH 29/32] Using the simple VCBuilder constructor and then subsequently trying to modify attributes was throwing a NPE. This is easily solved (without a performance hit) by initializing the attributes map to an immutable Collections.emptyMap(). Added unit test to cover this case. --- .../sting/utils/variantcontext/VariantContextBuilder.java | 1 + .../sting/utils/variantcontext/VariantContextUnitTest.java | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java index d8ab4bd23..40ac089ef 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java @@ -94,6 +94,7 @@ public class VariantContextBuilder { this.start = start; this.stop = stop; this.alleles = alleles; + this.attributes = Collections.emptyMap(); // immutable toValidate.add(VariantContext.Validation.ALLELES); } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java index 272166c68..19620b8df 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java @@ -750,6 +750,10 @@ public class VariantContextUnitTest extends BaseTest { modified = new VariantContextBuilder(modified).attributes(null).attribute("AC", 1).make(); Assert.assertEquals(modified.getAttribute("AC"), 1); + // test the behavior when the builder's attribute object is not initialized + modified = new VariantContextBuilder(modified.getSource(), modified.getChr(), modified.getStart(), modified.getEnd(), modified.getAlleles()).attribute("AC", 1).make(); + + // test normal attribute modification modified = new VariantContextBuilder(cfg.vc).attribute("AC", 1).make(); Assert.assertEquals(modified.getAttribute("AC"), 1); modified = new VariantContextBuilder(modified).attribute("AC", 2).make(); From 944e1c299df52909d46deb9eae0138376e0b8001 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 22 Aug 2012 13:07:13 -0400 Subject: [PATCH 30/32] Docs for --keepOriginalAC were wrong in SelectVariants --- .../sting/gatk/walkers/variantutils/SelectVariants.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 4c0c0cabf..fc29a7f02 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -265,7 +265,7 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC", required=false) private NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL; - @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false) + @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Store the original AC, AF, and AN values in the INFO field after selecting (using keys AC_Orig, AF_Orig, and AN_Orig)", required=false) private boolean KEEP_ORIGINAL_CHR_COUNTS = false; /** From 63213e8eb5d45173c99493813cde4af1d5e46416 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 22 Aug 2012 14:18:44 -0400 Subject: [PATCH 31/32] Expanding the HaplotypeCaller integration tests to cover a wider range of data --- .../HaplotypeCallerIntegrationTest.java | 30 ++++++++++++++++--- .../annotator/ClippingRankSumTest.java | 3 -- .../walkers/annotator/ReadPosRankSumTest.java | 1 - .../utils/activeregion/ActiveRegion.java | 1 - 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index fc9d56660..2ae1f2ca5 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -8,9 +8,10 @@ import java.util.Arrays; public class HaplotypeCallerIntegrationTest extends WalkerTest { final static String REF = b37KGReference; final String NA12878_BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; + final String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; final String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + final String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam"; final String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; - //final String RECAL_FILE = validationDataLocation + "NA12878.kmer.8.subset.recal_data.bqsr"; private void HCTest(String bam, String args, String md5) { final String base = String.format("-T HaplotypeCaller -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; @@ -34,14 +35,35 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { } private void HCTestComplexVariants(String bam, String args, String md5) { - final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 3"; + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 2"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec); } @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(CEUTRIO_BAM, "", "316a0fb9c66c0a6aa40a170d5d8c0021"); + HCTestComplexVariants(CEUTRIO_BAM, "", "3424b398a9f47c8ac606a5c56eb7d8a7"); + } + + private void HCTestSymbolicVariants(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 2"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec); + } + + @Test + public void testHaplotypeCallerSingleSampleSymbolic() { + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "b71cfaea9390136c584c9671b149d573"); + } + + private void HCTestIndelQualityScores(String bam, String args, String md5) { + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2"; + final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); + executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec); + } + + @Test + public void testHaplotypeCallerSingleSampleIndelQualityScores() { + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "e1f88fac91424740c0eaac1de48b3970"); } } - diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java index 449e047cd..1fd220f2f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java @@ -1,11 +1,8 @@ package org.broadinstitute.sting.gatk.walkers.annotator; -import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index 95fadfd46..1ac8ee113 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -5,7 +5,6 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 6756c1c02..1425800d8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -73,7 +73,6 @@ public class ActiveRegion implements HasGenomeLocation { Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases(); } - @Override public GenomeLoc getLocation() { return activeRegionLoc; } public GenomeLoc getExtendedLoc() { return extendedLoc; } From e5cfdb481198d4b3aac43bf6094f71a1396dbaec Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 22 Aug 2012 14:39:35 -0400 Subject: [PATCH 32/32] Bug fix for popular _Duplicate allele added to VariantContext_ error reported on the forum. It seems to be due to lower case bases in the reference being treated as reference mismatches. We would try to turn these mismatches into SNP events, for example c/C. We now uppercase the result from IndexedFastaSequenceFile.getSubsequenceAt() --- .../haplotypecaller/GenotypingEngine.java | 53 +++++-------------- .../haplotypecaller/HaplotypeCaller.java | 22 ++++---- .../haplotypecaller/HaplotypeResolver.java | 5 +- .../SimpleDeBruijnAssembler.java | 2 +- .../GenotypingEngineUnitTest.java | 3 +- .../utils/activeregion/ActiveRegion.java | 17 +++--- 6 files changed, 37 insertions(+), 65 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index c56cf5bf2..9de9b3292 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -42,14 +42,12 @@ import java.util.*; public class GenotypingEngine { private final boolean DEBUG; - private final int MNP_LOOK_AHEAD; private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE; private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); - public GenotypingEngine( final boolean DEBUG, final int MNP_LOOK_AHEAD, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { + public GenotypingEngine( final boolean DEBUG, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { this.DEBUG = DEBUG; - this.MNP_LOOK_AHEAD = MNP_LOOK_AHEAD; this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE; noCall.add(Allele.NO_CALL); } @@ -120,7 +118,7 @@ public class GenotypingEngine { System.out.println( "> Cigar = " + h.getCigar() ); } // Walk along the alignment and turn any difference from the reference into an event - h.setEventMap( generateVCsFromAlignment( h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++, MNP_LOOK_AHEAD ) ); + h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); startPosKeySet.addAll(h.getEventMap().keySet()); } @@ -203,7 +201,7 @@ public class GenotypingEngine { if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); } for( final Haplotype h : haplotypes ) { // Walk along the alignment and turn any difference from the reference into an event - h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++, MNP_LOOK_AHEAD ) ); + h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); } if( DEBUG ) { System.out.println( h.toString() ); @@ -521,11 +519,7 @@ public class GenotypingEngine { return false; } - protected static HashMap generateVCsFromAlignment( final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd, final int MNP_LOOK_AHEAD ) { - return generateVCsFromAlignment(null, alignmentStartHapwrtRef, cigar, ref, alignment, refLoc, sourceNameToAdd, MNP_LOOK_AHEAD); // BUGBUG: needed for compatibility with HaplotypeResolver code - } - - protected static HashMap generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd, final int MNP_LOOK_AHEAD ) { + protected static HashMap generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd ) { final HashMap vcs = new HashMap(); int refPos = alignmentStartHapwrtRef; @@ -543,7 +537,7 @@ public class GenotypingEngine { if( BaseUtils.isRegularBase(refByte) ) { insertionAlleles.add( Allele.create(refByte, true) ); } - if( haplotype != null && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) { + if( (haplotype.leftBreakPoint != 0 || haplotype.rightBreakPoint != 0) && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) { insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE ); } else { byte[] insertionBases = new byte[]{}; @@ -590,39 +584,16 @@ public class GenotypingEngine { case EQ: case X: { - int numSinceMismatch = -1; - int stopOfMismatch = -1; - int startOfMismatch = -1; - int refPosStartOfMismatch = -1; for( int iii = 0; iii < elementLength; iii++ ) { - if( ref[refPos] != alignment[alignmentPos] && alignment[alignmentPos] != ((byte) 'N') ) { - // SNP or start of possible MNP - if( stopOfMismatch == -1 ) { - startOfMismatch = alignmentPos; - stopOfMismatch = alignmentPos; - numSinceMismatch = 0; - refPosStartOfMismatch = refPos; - } else { - stopOfMismatch = alignmentPos; - } - } - if( stopOfMismatch != -1) { - numSinceMismatch++; - } - if( numSinceMismatch > MNP_LOOK_AHEAD || (iii == elementLength - 1 && stopOfMismatch != -1) ) { - final byte[] refBases = Arrays.copyOfRange( ref, refPosStartOfMismatch, refPosStartOfMismatch + (stopOfMismatch - startOfMismatch) + 1 ); - final byte[] mismatchBases = Arrays.copyOfRange( alignment, startOfMismatch, stopOfMismatch + 1 ); - if( BaseUtils.isAllRegularBases(refBases) && BaseUtils.isAllRegularBases(mismatchBases) ) { + final byte refByte = ref[refPos]; + final byte altByte = alignment[alignmentPos]; + if( refByte != altByte ) { // SNP! + if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) { final ArrayList snpAlleles = new ArrayList(); - snpAlleles.add( Allele.create( refBases, true ) ); - snpAlleles.add( Allele.create( mismatchBases, false ) ); - final int snpStart = refLoc.getStart() + refPosStartOfMismatch; - vcs.put(snpStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), snpStart, snpStart + (stopOfMismatch - startOfMismatch), snpAlleles).make()); + snpAlleles.add( Allele.create( refByte, true ) ); + snpAlleles.add( Allele.create( altByte, false ) ); + vcs.put(refLoc.getStart() + refPos, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make()); } - numSinceMismatch = -1; - stopOfMismatch = -1; - startOfMismatch = -1; - refPosStartOfMismatch = -1; } refPos++; alignmentPos++; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 4f434bba6..8079c2e1f 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -122,10 +122,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) protected String keepRG = null; - @Hidden - @Argument(fullName="mnpLookAhead", shortName="mnpLookAhead", doc = "The number of bases to combine together to form MNPs out of nearby consecutive SNPs on the same haplotype", required = false) - protected int MNP_LOOK_AHEAD = 0; - @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) protected int MIN_PRUNE_FACTOR = 1; @@ -138,7 +134,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false; @Advanced - @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Gap continuation penalty for use in the Pair HMM", required = false) + @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) protected int gcpHMM = 10; @Argument(fullName="downsampleRegion", shortName="dr", doc="coverage, per-sample, to downsample each active region to", required = false) @@ -192,21 +188,21 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @ArgumentCollection private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection(); - // the calculation arguments - private UnifiedGenotyperEngine UG_engine = null; - private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; - @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false) protected boolean DEBUG; + // the UG engines + private UnifiedGenotyperEngine UG_engine = null; + private UnifiedGenotyperEngine UG_engine_simple_genotyper = null; + // the assembly engine - LocalAssemblyEngine assemblyEngine = null; + private LocalAssemblyEngine assemblyEngine = null; // the likelihoods engine - LikelihoodCalculationEngine likelihoodCalculationEngine = null; + private LikelihoodCalculationEngine likelihoodCalculationEngine = null; // the genotyping engine - GenotypingEngine genotypingEngine = null; + private GenotypingEngine genotypingEngine = null; // the annotation engine private VariantAnnotatorEngine annotationEngine; @@ -292,7 +288,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, false ); - genotypingEngine = new GenotypingEngine( DEBUG, MNP_LOOK_AHEAD, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); + genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); } //--------------------------------------------------------------------------------------------------------------- diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java index 38f11cc5d..8a401439b 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeResolver.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.SWPairwiseAlignment; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; @@ -337,8 +338,8 @@ public class HaplotypeResolver extends RodWalker { } // order results by start position - final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(0, swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1, 0)); - final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2, 0)); + final TreeMap source1Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source1Haplotype), 0, swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1)); + final TreeMap source2Map = new TreeMap(GenotypingEngine.generateVCsFromAlignment(new Haplotype(source2Haplotype), 0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2)); if ( source1Map.size() == 0 || source2Map.size() == 0 ) { // TODO -- handle errors appropriately logger.debug("No source alleles; aborting at " + refContext.getLocus()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index 56cb6c3d4..93fd36a22 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -292,7 +292,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() ); if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present - final HashMap eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly", 0 ); // BUGBUG: need to put this function in a shared place + final HashMap eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart()); if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) { diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java index 539190fe9..07e7b0d92 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngineUnitTest.java @@ -142,7 +142,6 @@ public class GenotypingEngineUnitTest extends BaseTest { byte[] ref; byte[] hap; HashMap expected; - GenotypingEngine ge = new GenotypingEngine(false, 0, false); public BasicGenotypingTestProvider(String refString, String hapString, HashMap expected) { super(BasicGenotypingTestProvider.class, String.format("Haplotype to VCF test: ref = %s, alignment = %s", refString,hapString)); @@ -153,7 +152,7 @@ public class GenotypingEngineUnitTest extends BaseTest { public HashMap calcAlignment() { final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap); - return ge.generateVCsFromAlignment( alignment.getAlignmentStart2wrt1(), alignment.getCigar(), ref, hap, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name", 0); + return GenotypingEngine.generateVCsFromAlignment( new Haplotype(hap), alignment.getAlignmentStart2wrt1(), alignment.getCigar(), ref, hap, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name"); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 8e660350f..dab8ddc78 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.activeregion; import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.util.StringUtil; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.HasGenomeLocation; @@ -58,9 +59,7 @@ public class ActiveRegion implements HasGenomeLocation, Comparable } public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { - return referenceReader.getSubsequenceAt( extendedLoc.getContig(), - Math.max(1, extendedLoc.getStart() - padding), - Math.min(referenceReader.getSequenceDictionary().getSequence(extendedLoc.getContig()).getSequenceLength(), extendedLoc.getStop() + padding) ).getBases(); + return getReference( referenceReader, padding, extendedLoc ); } public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) { @@ -68,9 +67,15 @@ public class ActiveRegion implements HasGenomeLocation, Comparable } public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { - return referenceReader.getSubsequenceAt( fullExtentReferenceLoc.getContig(), - Math.max(1, fullExtentReferenceLoc.getStart() - padding), - Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases(); + return getReference( referenceReader, padding, fullExtentReferenceLoc ); + } + + private byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { + final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(), + Math.max(1, genomeLoc.getStart() - padding), + Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases(); + StringUtil.toUpperCase(reference); + return reference; } @Override