diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java index ed80dce0d..9c4af8512 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java @@ -48,8 +48,16 @@ public class AlleleFrequencyCalculationResult { double log10LikelihoodOfAFzero = 0.0; double log10PosteriorOfAFzero = 0.0; - AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) { + public AlleleFrequencyCalculationResult(int maxAltAlleles, int numChr) { log10AlleleFrequencyLikelihoods = new double[maxAltAlleles][numChr+1]; log10AlleleFrequencyPosteriors = new double[maxAltAlleles][numChr+1]; } + + public double getLog10LikelihoodOfAFzero() { + return log10LikelihoodOfAFzero; + } + + public double getLog10PosteriorOfAFzero() { + return log10PosteriorOfAFzero; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java index 5c0d6cb87..62305d3c0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java @@ -38,10 +38,10 @@ public abstract class FrequencyModeSelector implements Cloneable{ protected FrequencyModeSelector(GenomeLocParser parser) { this.parser = parser; } - protected void logCurrentSiteData(VariantContext vc, VariantContext subVC) { - logCurrentSiteData(vc, subVC, false, false); + protected void logCurrentSiteData(VariantContext vc, boolean passesCriteria) { + logCurrentSiteData(vc, passesCriteria, false, false); } - protected abstract void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC); + protected abstract void logCurrentSiteData(VariantContext vc, boolean included, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC); protected abstract ArrayList selectValidationSites(int numValidationSites); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index d229e718e..ff3fe6506 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -23,21 +23,52 @@ */ package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; +import org.broadinstitute.sting.gatk.walkers.genotyper.AlleleFrequencyCalculationResult; +import org.broadinstitute.sting.gatk.walkers.genotyper.ExactAFCalculationModel; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.TreeSet; public class GLBasedSampleSelector extends SampleSelector { - public GLBasedSampleSelector(TreeSet sm) { + Map numAllelePriorMatrix = new HashMap(); + double referenceLikelihood; + public GLBasedSampleSelector(TreeSet sm, double refLik) { super(sm); + referenceLikelihood = refLik; } - public VariantContext subsetSiteToSamples(VariantContext vc) { - /* todo - Look at sample array, and create a new vc with samples for which GL's indicate they should be included. - For example, include all samples (and corresponding genotypes) whose GL's are such that argmax(GL) = HET or HOMVAR. */ - throw new ReviewedStingException("GLBasedSampleSelector not implemented yet!"); - //return true; + public boolean selectSiteInSamples(VariantContext vc) { + if ( samples == null || samples.isEmpty() ) + return true; + // want to include a site in the given samples if it is *likely* to be variant (via the EXACT model) + // first subset to the samples + VariantContext subContext = vc.subContextFromSamples(samples); + + // now check to see (using EXACT model) whether this should be variant + // do we want to apply a prior? maybe user-spec? + double[][] flatPrior = createFlatPrior(vc.getAlleles()); + AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size(),2*samples.size()); + ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPrior,result,true); + // do we want to let this qual go up or down? + if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) { + return true; + } + + return false; + } + + private double[][] createFlatPrior(List alleles) { + if ( ! numAllelePriorMatrix.containsKey(alleles.size()) ) { + numAllelePriorMatrix.put(alleles.size(), new double[alleles.size()][1+2*samples.size()]); + } + + return numAllelePriorMatrix.get(alleles.size()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java index 44fca71fb..c3987b9db 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java @@ -38,14 +38,18 @@ public class GTBasedSampleSelector extends SampleSelector{ super(sm); } - public VariantContext subsetSiteToSamples(VariantContext vc) { + public boolean selectSiteInSamples(VariantContext vc) { // Super class already defined initialization which filled data structure "samples" with desired samples. // We only need to check if current vc if polymorphic in that set of samples if ( samples == null || samples.isEmpty() ) - return vc; + return true; - return vc.subContextFromSamples(samples, vc.getAlleles()); + VariantContext subContext = vc.subContextFromSamples(samples, vc.getAlleles()); + if ( subContext.isPolymorphicInSamples() ) { + return true; + } + return false; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java index 739cfcab4..15274d21c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java @@ -60,7 +60,7 @@ public class KeepAFSpectrumFrequencySelector extends FrequencyModeSelector { postSampleSelectionHistogram = new int[NUM_BINS]; } - public void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) { + public void logCurrentSiteData(VariantContext vc, boolean selectedInTargetSamples, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) { // this method is called for every variant of a selected type, regardless of whether it will be selectable or not // get AC,AF,AN attributes from vc @@ -107,7 +107,7 @@ public class KeepAFSpectrumFrequencySelector extends FrequencyModeSelector { numTotalSites++; // now process VC subsetted to samples of interest - if (!subVC.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC) + if (! selectedInTargetSamples && !IGNORE_POLYMORPHIC) return; //System.out.format("Post:%4.4f %d\n",af0, binIndex); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java index 35b8893c2..a48bcb8a1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java @@ -33,8 +33,7 @@ public class NullSampleSelector extends SampleSelector{ super(sm); } - public VariantContext subsetSiteToSamples(VariantContext vc) { - return vc; - + public boolean selectSiteInSamples(VariantContext vc) { + return true; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java index e75fcef5d..afbff93d0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java @@ -34,7 +34,7 @@ public abstract class SampleSelector implements Cloneable { samples = new TreeSet(sm); } - protected abstract VariantContext subsetSiteToSamples(VariantContext vc); + protected abstract boolean selectSiteInSamples(VariantContext vc); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java index 506a6b2e4..66720a252 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java @@ -42,14 +42,14 @@ public class UniformSamplingFrequencySelector extends FrequencyModeSelector { } - public void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) { + public void logCurrentSiteData(VariantContext vc, boolean selectedInTargetSamples, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) { HashMap attributes = new HashMap(); if (vc.hasGenotypes() && !IGNORE_GENOTYPES) { // recompute AF,AC,AN based on genotypes: VariantContextUtils.calculateChromosomeCounts(vc, attributes, false); - if (!subVC.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC) + if (! selectedInTargetSamples && !IGNORE_POLYMORPHIC) return; } else { if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java index 10d2f639c..ae11d8102 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java @@ -124,6 +124,10 @@ public class ValidationSiteSelectorWalker extends RodWalker { @Argument(fullName="sampleMode", shortName="sampleMode", doc="Sample selection mode", required=false) private SAMPLE_SELECTION_MODE sampleMode = SAMPLE_SELECTION_MODE.NONE; + @Argument(shortName="samplePNonref",fullName="samplePNonref", doc="GL-based selection mode only: the probability" + + " that a site is non-reference in the samples for which to include the site",required=false) + private double samplePNonref = 0.99; + @Argument(fullName="numValidationSites", shortName="numSites", doc="Number of output validation sites", required=true) private int numValidationSites; @@ -231,13 +235,14 @@ public class ValidationSiteSelectorWalker extends RodWalker { continue; - // do anything required by frequency selector before we select for samples - VariantContext subVC; + // does this site pass the criteria for the samples we are interested in? + boolean passesSampleSelectionCriteria; if (samples.isEmpty()) - subVC = vc; + passesSampleSelectionCriteria = true; else - subVC = sampleSelector.subsetSiteToSamples(vc); - frequencyModeSelector.logCurrentSiteData(vc, subVC, IGNORE_GENOTYPES, IGNORE_POLYMORPHIC); + passesSampleSelectionCriteria = sampleSelector.selectSiteInSamples(vc); + + frequencyModeSelector.logCurrentSiteData(vc,passesSampleSelectionCriteria,IGNORE_GENOTYPES,IGNORE_POLYMORPHIC); } return 1; } @@ -263,7 +268,7 @@ public class ValidationSiteSelectorWalker extends RodWalker { SampleSelector sm; switch ( sampleMode ) { case POLY_BASED_ON_GL: - sm = new GLBasedSampleSelector(samples); + sm = new GLBasedSampleSelector(samples, Math.log10(1.0-samplePNonref)); break; case POLY_BASED_ON_GT: sm = new GTBasedSampleSelector(samples); diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index f440eda5d..7eb853097 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -104,6 +104,10 @@ public class ClippingOp { break; + case REVERT_SOFTCLIPPED_BASES: + read = revertSoftClippedBases(read); + break; + default: throw new IllegalStateException("Unexpected Clipping operator type " + algorithm); } @@ -111,6 +115,38 @@ public class ClippingOp { return read; } + private GATKSAMRecord revertSoftClippedBases(GATKSAMRecord read) { + GATKSAMRecord unclipped; + + // shallow copy of the read bases and quals should be fine here because they are immutable in the original read + try { + unclipped = (GATKSAMRecord) read.clone(); + } catch (CloneNotSupportedException e) { + throw new ReviewedStingException("Where did the clone go?"); + } + + Cigar unclippedCigar = new Cigar(); + int matchesCount = 0; + for (CigarElement element : read.getCigar().getCigarElements()) { + if (element.getOperator() == CigarOperator.SOFT_CLIP || element.getOperator() == CigarOperator.MATCH_OR_MISMATCH) + matchesCount += element.getLength(); + else if (matchesCount > 0) { + unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); + matchesCount = 0; + unclippedCigar.add(element); + } + else + unclippedCigar.add(element); + } + if (matchesCount > 0) + unclippedCigar.add(new CigarElement(matchesCount, CigarOperator.MATCH_OR_MISMATCH)); + + unclipped.setCigar(unclippedCigar); + unclipped.setAlignmentStart(read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), unclippedCigar)); + + return unclipped; + } + /** * Given a cigar string, get the number of bases hard or soft clipped at the start */ @@ -472,7 +508,7 @@ public class ClippingOp { for (CigarElement cigarElement : oldCigar.getCigarElements()) { if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP ) - oldShift += Math.min(cigarElement.getLength(), newShift - oldShift); + oldShift += cigarElement.getLength(); else if (readHasStarted) break; } diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java index c9d8730d1..f0765665a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingRepresentation.java @@ -29,5 +29,10 @@ public enum ClippingRepresentation { * lossy) operation. Note that this can only be applied to cases where the clipped * bases occur at the start or end of a read. */ - HARDCLIP_BASES + HARDCLIP_BASES, + + /** + * Turn all soft-clipped bases into matches + */ + REVERT_SOFTCLIPPED_BASES, } diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index 6e3f37980..f19a7a04f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -213,4 +213,9 @@ public class ReadClipper { } return clipRead(ClippingRepresentation.HARDCLIP_BASES); } + + public GATKSAMRecord revertSoftClippedBases() { + this.addOp(new ClippingOp(0, 0)); // UNSOFTCLIP_BASES doesn't need coordinates + return this.clipRead(ClippingRepresentation.REVERT_SOFTCLIPPED_BASES); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index ac1f00437..a46a7f0bb 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -851,50 +851,6 @@ public class ReadUtils { return new Pair(readBases, fallsInsideDeletion); } - public static GATKSAMRecord unclipSoftClippedBases(GATKSAMRecord read) { - int newReadStart = read.getAlignmentStart(); - int newReadEnd = read.getAlignmentEnd(); - List newCigarElements = new ArrayList(read.getCigar().getCigarElements().size()); - int heldOver = -1; - boolean sSeen = false; - for ( CigarElement e : read.getCigar().getCigarElements() ) { - if ( e.getOperator().equals(CigarOperator.S) ) { - newCigarElements.add(new CigarElement(e.getLength(),CigarOperator.M)); - if ( sSeen ) { - newReadEnd += e.getLength(); - sSeen = true; - } else { - newReadStart -= e.getLength(); - } - } else { - newCigarElements.add(e); - } - } - // merge duplicate operators together - int idx = 0; - List finalCigarElements = new ArrayList(read.getCigar().getCigarElements().size()); - while ( idx < newCigarElements.size() -1 ) { - if ( newCigarElements.get(idx).getOperator().equals(newCigarElements.get(idx+1).getOperator()) ) { - int combSize = newCigarElements.get(idx).getLength(); - int offset = 0; - while ( idx + offset < newCigarElements.size()-1 && newCigarElements.get(idx+offset).getOperator().equals(newCigarElements.get(idx+1+offset).getOperator()) ) { - combSize += newCigarElements.get(idx+offset+1).getLength(); - offset++; - } - finalCigarElements.add(new CigarElement(combSize,newCigarElements.get(idx).getOperator())); - idx = idx + offset -1; - } else { - finalCigarElements.add(newCigarElements.get(idx)); - } - idx++; - } - - read.setCigar(new Cigar(finalCigarElements)); - read.setAlignmentStart(newReadStart); - - return read; - } - /** * Compares two SAMRecords only the basis on alignment start. Note that * comparisons are performed ONLY on the basis of alignment start; any diff --git a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java index 3b4b0abce..e228762b7 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -279,9 +279,9 @@ public class ReadClipperUnitTest extends BaseTest { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); GATKSAMRecord clippedRead = (new ReadClipper(read)).hardClipLeadingInsertions(); - int expectedLength = read.getReadLength() - leadingInsertionLength(read.getCigar()); + int expectedLength = read.getReadLength() - leadingCigarElementLength(read.getCigar(), CigarOperator.INSERTION); if (cigarHasElementsDifferentThanInsertionsAndHardClips(read.getCigar())) - expectedLength -= leadingInsertionLength(ReadClipperTestUtils.invertCigar(read.getCigar())); + expectedLength -= leadingCigarElementLength(ReadClipperTestUtils.invertCigar(read.getCigar()), CigarOperator.INSERTION); if (! clippedRead.isEmpty()) { Assert.assertEquals(expectedLength, clippedRead.getReadLength(), String.format("%s -> %s", read.getCigarString(), clippedRead.getCigarString())); // check that everything else is still there @@ -293,6 +293,27 @@ public class ReadClipperUnitTest extends BaseTest { } } + @Test(enabled = true) + public void testRevertSoftClippedBases() + { + for (Cigar cigar: cigarList) { + final int leadingSoftClips = leadingCigarElementLength(cigar, CigarOperator.SOFT_CLIP); + final int tailSoftClips = leadingCigarElementLength(ReadClipperTestUtils.invertCigar(cigar), CigarOperator.SOFT_CLIP); + + final GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); + final GATKSAMRecord unclipped = (new ReadClipper(read)).revertSoftClippedBases(); + + if ( leadingSoftClips > 0 || tailSoftClips > 0) { + final int expectedStart = read.getAlignmentStart() - leadingSoftClips; + final int expectedEnd = read.getAlignmentEnd() + tailSoftClips; + + Assert.assertEquals(unclipped.getAlignmentStart(), expectedStart); + Assert.assertEquals(unclipped.getAlignmentEnd(), expectedEnd); + } + else + Assert.assertEquals(read.getCigarString(), unclipped.getCigarString()); + } + } private void assertNoLowQualBases(GATKSAMRecord read, byte low_qual) { @@ -304,12 +325,12 @@ public class ReadClipperUnitTest extends BaseTest { } private boolean startsWithInsertion(Cigar cigar) { - return leadingInsertionLength(cigar) > 0; + return leadingCigarElementLength(cigar, CigarOperator.INSERTION) > 0; } - private int leadingInsertionLength(Cigar cigar) { + private int leadingCigarElementLength(Cigar cigar, CigarOperator operator) { for (CigarElement cigarElement : cigar.getCigarElements()) { - if (cigarElement.getOperator() == CigarOperator.INSERTION) + if (cigarElement.getOperator() == operator) return cigarElement.getLength(); if (cigarElement.getOperator() != CigarOperator.HARD_CLIP) break;