diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 7e5f388b2..dc85e95e4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -776,9 +776,9 @@ public class UnifiedGenotyperEngine { * * @return genotypes */ - public GenotypesContext assignGenotypes(final VariantContext vc, - final boolean[] allelesToUse, - final List newAlleles) { + public static GenotypesContext assignGenotypes(final VariantContext vc, + final boolean[] allelesToUse, + final List newAlleles) { // the no-called genotypes final GenotypesContext GLs = vc.getGenotypes(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java new file mode 100644 index 000000000..5c0d6cb87 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/FrequencyModeSelector.java @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; + +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.ArrayList; +import java.util.HashMap; + +public abstract class FrequencyModeSelector implements Cloneable{ + + protected GenomeLocParser parser; + + protected FrequencyModeSelector(GenomeLocParser parser) { + this.parser = parser; + } + protected void logCurrentSiteData(VariantContext vc, VariantContext subVC) { + logCurrentSiteData(vc, subVC, false, false); + } + protected abstract void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC); + protected abstract ArrayList selectValidationSites(int numValidationSites); + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java new file mode 100644 index 000000000..d229e718e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.TreeSet; + + +public class GLBasedSampleSelector extends SampleSelector { + public GLBasedSampleSelector(TreeSet sm) { + super(sm); + } + + public VariantContext subsetSiteToSamples(VariantContext vc) { + /* todo - Look at sample array, and create a new vc with samples for which GL's indicate they should be included. + For example, include all samples (and corresponding genotypes) whose GL's are such that argmax(GL) = HET or HOMVAR. */ + throw new ReviewedStingException("GLBasedSampleSelector not implemented yet!"); + //return true; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java new file mode 100644 index 000000000..44fca71fb --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; + +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.ArrayList; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + + +public class GTBasedSampleSelector extends SampleSelector{ + public GTBasedSampleSelector(TreeSet sm) { + super(sm); + } + + public VariantContext subsetSiteToSamples(VariantContext vc) { + // Super class already defined initialization which filled data structure "samples" with desired samples. + // We only need to check if current vc if polymorphic in that set of samples + + if ( samples == null || samples.isEmpty() ) + return vc; + + return vc.subContextFromSamples(samples, vc.getAlleles()); + + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GenomeEvent.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GenomeEvent.java new file mode 100644 index 000000000..af6a52002 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GenomeEvent.java @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; + +import java.util.HashMap; +import java.util.List; + + +public class GenomeEvent implements Comparable { + final protected GenomeLoc loc; + /** A set of the alleles segregating in this context */ + final protected List alleles; + final protected Byte refBase; +// final protected HashMap attributes; + + public GenomeEvent(GenomeLocParser parser, final String contig, final int start, final int stop, final List alleles, HashMap attributes, + byte base) { + this.loc = parser.createGenomeLoc(contig, start, stop); + this.alleles = alleles; + this.refBase = base; +// this.attributes = attributes; + } + + // Routine to compare two variant contexts (useful to sort collections of vc's). + // By default, we want to sort first by contig, then by start location + + public GenomeLoc getGenomeLoc() { + return loc; + } + public int compareTo(final Object o) { + if (!(o instanceof GenomeEvent)) + throw new ReviewedStingException("BUG: comparing variant context with non-VC object"); + + GenomeEvent otherEvent = (GenomeEvent)o; + + return loc.compareTo(otherEvent.getGenomeLoc()); + } + + public VariantContext createVariantContextFromEvent() { + return new VariantContextBuilder("event", loc.getContig(), loc.getStart(), loc.getStop(), alleles) + .log10PError(0.0).referenceBaseForIndel(refBase).make(); + + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java new file mode 100644 index 000000000..739cfcab4 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; + + +public class KeepAFSpectrumFrequencySelector extends FrequencyModeSelector { + + private static final boolean DEBUG = true; + + private int NUM_BINS = 20; + + private int[] preSampleSelectionHistogram; + private int numTotalSites = 0; + private int[] postSampleSelectionHistogram; + private int numSampleSelectedSites = 0; + private ArrayList[] binnedEventArray; + + public KeepAFSpectrumFrequencySelector(int numBins, GenomeLocParser parser) { + super(parser); + NUM_BINS = numBins; + // initialize arrays dependent on NUM_BINS + binnedEventArray = new ArrayList[NUM_BINS]; + + for (int k=0; k < NUM_BINS; k++) + binnedEventArray[k] = new ArrayList(); + + preSampleSelectionHistogram = new int[NUM_BINS]; + postSampleSelectionHistogram = new int[NUM_BINS]; + } + + public void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) { + + // this method is called for every variant of a selected type, regardless of whether it will be selectable or not + // get AC,AF,AN attributes from vc + HashMap attributes = new HashMap(); + double[] afArray = null; + + if (vc.hasGenotypes() && !IGNORE_GENOTYPES) { + // recompute AF,AC,AN based on genotypes: + // todo - - maybe too inefficient?? + VariantContextUtils.calculateChromosomeCounts(vc, attributes, false); + afArray = new double[] {Double.valueOf((String)attributes.get(VCFConstants.ALLELE_FREQUENCY_KEY))}; + } else { + // sites-only vc or we explicitly tell to ignore genotypes; we trust the AF field if present + if ( vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) { + String afo = vc.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY, null); + + if (afo.contains(",")) { + String[] afs = afo.split(","); + afs[0] = afs[0].substring(1,afs[0].length()); + afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1); + + afArray = new double[afs.length]; + + for (int k=0; k < afArray.length; k++) + afArray[k] = Double.valueOf(afs[k]); + } + else + afArray = new double[] {Double.valueOf(afo)}; + } + } + + + if (afArray == null ) + return; + + double af0 = MathUtils.arrayMax(afArray); + + int binIndex = (NUM_BINS-1) - (int) Math.floor(((1.0-af0)*NUM_BINS)); + // deal with round-off issue: low-AC sites with large samples can have AF rounded down to 0.000 + if (binIndex < 0) + binIndex = 0; +// System.out.format("Pre:%4.4f %d\n",af0, binIndex); + preSampleSelectionHistogram[binIndex]++; + numTotalSites++; + + // now process VC subsetted to samples of interest + if (!subVC.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC) + return; + + //System.out.format("Post:%4.4f %d\n",af0, binIndex); + postSampleSelectionHistogram[binIndex]++; + numSampleSelectedSites++; + + // create bare-bones event and log in corresponding bin + // attributes contains AC,AF,AN pulled from original vc, and we keep them here and log in output file for bookkeeping purposes + GenomeEvent event = new GenomeEvent(parser, vc.getChr(), vc.getStart(), vc.getEnd(),vc.getAlleles(), attributes, vc.getReferenceBaseForIndel()); + + binnedEventArray[binIndex].add(event); + + } + + public ArrayList selectValidationSites(int numValidationSites) { + // number of sites to choose at random for each frequency bin = #desired validation sites/# total sites * #sites in original bin + int[] sitesToChoosePerBin = new int[NUM_BINS]; + int totalSites = 0; + for (int k=0; k < NUM_BINS; k++) { + int sites = (int)Math.round((double)numValidationSites * preSampleSelectionHistogram[k]/ (double)numTotalSites); + sitesToChoosePerBin[k] = sites; + totalSites += sites; + } + + // deal with rounding artifacts + while (totalSites > numValidationSites) { + // take off one from randomly selected bin + int k= GenomeAnalysisEngine.getRandomGenerator().nextInt(NUM_BINS); + sitesToChoosePerBin[k]--; + totalSites--; + } + while (totalSites < numValidationSites) { + // take off one from randomly selected bin + int k= GenomeAnalysisEngine.getRandomGenerator().nextInt( NUM_BINS); + sitesToChoosePerBin[k]++; + totalSites++; + } + + if (DEBUG) { + System.out.println("sitesToChoosePerBin:"); + for (int k=0; k < NUM_BINS; k++) + System.out.format("%d ", sitesToChoosePerBin[k]); + System.out.println(); + + System.out.println("preSampleSelectionHistogram:"); + for (int k=0; k < NUM_BINS; k++) + System.out.format("%d ", preSampleSelectionHistogram[k]); + System.out.println(); + + System.out.println("postSampleSelectionHistogram:"); + for (int k=0; k < NUM_BINS; k++) + System.out.format("%d ", postSampleSelectionHistogram[k]); + System.out.println(); + + } + + // take randomly sitesToChoosePerBin[k] elements from each bin + ArrayList selectedEvents = new ArrayList(); + + for (int k=0; k < NUM_BINS; k++) { + selectedEvents.addAll(MathUtils.randomSubset(binnedEventArray[k], sitesToChoosePerBin[k])); + } + + Collections.sort(selectedEvents); + + // now convert to VC + ArrayList selectedSites = new ArrayList(); + for (GenomeEvent event : selectedEvents) + selectedSites.add(event.createVariantContextFromEvent()); + + return selectedSites; + + } + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java new file mode 100644 index 000000000..35b8893c2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/NullSampleSelector.java @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; + +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.util.TreeSet; + + +public class NullSampleSelector extends SampleSelector{ + public NullSampleSelector(TreeSet sm) { + super(sm); + } + + public VariantContext subsetSiteToSamples(VariantContext vc) { + return vc; + + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java new file mode 100644 index 000000000..e75fcef5d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/SampleSelector.java @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; + +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.util.TreeSet; + + +public abstract class SampleSelector implements Cloneable { + + TreeSet samples; + protected SampleSelector(TreeSet sm) { + samples = new TreeSet(sm); + } + + protected abstract VariantContext subsetSiteToSamples(VariantContext vc); + + +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java new file mode 100644 index 000000000..506a6b2e4 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/UniformSamplingFrequencySelector.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; + +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; + +public class UniformSamplingFrequencySelector extends FrequencyModeSelector { + private ArrayList binnedEventArray; + + public UniformSamplingFrequencySelector(GenomeLocParser parser) { + super(parser); + binnedEventArray = new ArrayList(); + + } + + public void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) { + HashMap attributes = new HashMap(); + + + if (vc.hasGenotypes() && !IGNORE_GENOTYPES) { + // recompute AF,AC,AN based on genotypes: + VariantContextUtils.calculateChromosomeCounts(vc, attributes, false); + if (!subVC.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC) + return; + } else { + if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) ) { + int ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0); + if (ac == 0) return; // site not polymorphic + } + else + return; + + } + // create bare-bones event and log in corresponding bin + // attributes contains AC,AF,AN pulled from original vc, and we keep them here and log in output file for bookkeeping purposes + GenomeEvent event = new GenomeEvent(parser, vc.getChr(), vc.getStart(), vc.getEnd(),vc.getAlleles(), attributes, vc.getReferenceBaseForIndel()); + binnedEventArray.add(event); + + } + + public ArrayList selectValidationSites(int numValidationSites) { + + // take randomly sitesToChoosePerBin[k] elements from each bin + ArrayList selectedEvents = new ArrayList(); + + selectedEvents.addAll(MathUtils.randomSubset(binnedEventArray, numValidationSites)); + + Collections.sort(selectedEvents); + + // now convert to VC + ArrayList selectedSites = new ArrayList(); + for (GenomeEvent event : selectedEvents) + selectedSites.add(event.createVariantContextFromEvent()); + + return selectedSites; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java new file mode 100644 index 000000000..10d2f639c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/ValidationSiteSelectorWalker.java @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.io.File; +import java.util.*; + + +/** + * Randomly selects VCF records according to specified options. + * + *

+ * ValidationSiteSelectorWalker is intended for use in experiments where we sample data randomly from a set of variants, for example + * in order to choose sites for a follow-up validation study. + * + * Sites are selected randomly but within certain restrictions. There are two main sources of restrictions + * a) Sample restrictions. A user can specify a set of samples, and we will only consider sites which are polymorphic within such given sample subset. + * These sample restrictions can be given as a set of individual samples, a text file (each line containing a sample name), or a regular expression. + * A user can additionally specify whether samples will be considered based on their genotypes (a non-reference genotype means that such sample is polymorphic in that variant, + * and hence that variant will be considered for inclusion in set), or based on their PLs. + * b) A user can additionally specify a sampling method based on allele frequency. Two sampling methods are currently supported. + * 1. Uniform sampling will just sample uniformly from variants polymorphic in selected samples. + * 2. Sampling based on Allele Frequency spectrum will ensure that output sites have the same AF distribution as the input set. + * + * User can additionally restrict output to a particular type of variant (SNP, Indel, etc.) + * + *

Input

+ *

+ * One or more variant sets to choose from. + *

+ * + *

Output

+ *

+ * A sites-only VCF with the desired number of randomly selected sites. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T ValidationSiteSelectorWalker \
+ *   --variant input1.vcf \
+ *   --variant input2.vcf \
+ *   -sn NA12878 \
+ *   -o output.vcf \
+ *   --numValidationSites 200   \
+ *   -sampleMode  POLY_BASED_ON_GT \
+ *   -freqMode KEEP_AF_SPECTRUM
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T ValidationSiteSelectorWalker \
+ *   --variant:foo input1.vcf \
+ *   --variant:bar input2.vcf \
+ *   --numValidationSites 200 \
+ *   -sf samples.txt \
+ *   -o output.vcf \
+ *   -sampleMode  POLY_BASED_ON_GT \
+  *   -freqMode UNIFORM
+ *   -selectType INDEL
+ * 
+ * + */ +public class ValidationSiteSelectorWalker extends RodWalker { + + public enum AF_COMPUTATION_MODE { + KEEP_AF_SPECTRUM, + UNIFORM + } + + public enum SAMPLE_SELECTION_MODE { + NONE, + POLY_BASED_ON_GT, + POLY_BASED_ON_GL + } + + @Input(fullName="variant", shortName = "V", doc="Input VCF file, can be specified multiple times", required=true) + public List> variants; + + @Output(doc="File to which variants should be written",required=true) + protected VCFWriter vcfWriter = null; + + @Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false) + public Set sampleNames = new HashSet(0); + + @Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false) + public Set sampleExpressions ; + + @Input(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false) + public Set sampleFiles; + + @Argument(fullName="sampleMode", shortName="sampleMode", doc="Sample selection mode", required=false) + private SAMPLE_SELECTION_MODE sampleMode = SAMPLE_SELECTION_MODE.NONE; + + @Argument(fullName="numValidationSites", shortName="numSites", doc="Number of output validation sites", required=true) + private int numValidationSites; + + @Argument(fullName="includeFilteredSites", shortName="ifs", doc="If true, will include filtered sites in set to choose variants from", required=false) + private boolean INCLUDE_FILTERED_SITES = false; + + @Argument(fullName="ignoreGenotypes", shortName="ignoreGenotypes", doc="If true, will ignore genotypes in VCF, will take AC,AF from annotations and will make no sample selection", required=false) + private boolean IGNORE_GENOTYPES = false; + + @Argument(fullName="ignorePolymorphicStatus", shortName="ignorePolymorphicStatus", doc="If true, will ignore polymorphic status in VCF, and will take VCF record directly without pre-selection", required=false) + private boolean IGNORE_POLYMORPHIC = false; + + @Hidden + @Argument(fullName="numFrequencyBins", shortName="numBins", doc="Number of frequency bins if we're to match AF distribution", required=false) + private int numFrequencyBins = 20; + + /** + * This argument selects allele frequency selection mode: + * KEEP_AF_SPECTRUM will choose variants so that the resulting allele frequency spectrum matches as closely as possible the input set + * UNIFORM will choose variants uniformly without regard to their allele frequency. + * + */ + @Argument(fullName="frequencySelectionMode", shortName="freqMode", doc="Allele Frequency selection mode", required=false) + private AF_COMPUTATION_MODE freqMode = AF_COMPUTATION_MODE.KEEP_AF_SPECTRUM; + + /** + * This argument selects particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. + * When specified one or more times, a particular type of variant is selected. + * + */ + @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false) + private List TYPES_TO_INCLUDE = new ArrayList(); + + + private TreeSet samples = new TreeSet(); + SampleSelector sampleSelector = null; + FrequencyModeSelector frequencyModeSelector = null; + private ArrayList selectedTypes = new ArrayList(); + + public void initialize() { + // Get list of samples to include in the output + Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit()); + TreeSet vcfSamples = new TreeSet(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE)); + + Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); + Collection samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); + + // first, add any requested samples + samples.addAll(samplesFromFile); + samples.addAll(samplesFromExpressions); + samples.addAll(sampleNames); + + // if none were requested, we want all of them + if ( samples.isEmpty() ) { + samples.addAll(vcfSamples); + + } + + sampleSelector = getSampleSelectorObject(sampleMode, samples); + + // initialize frequency mode selector + frequencyModeSelector = getFrequencyModeSelectorObject(freqMode, getToolkit().getGenomeLocParser()); + + // if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include + if (TYPES_TO_INCLUDE.isEmpty()) { + + for (VariantContext.Type t : VariantContext.Type.values()) + selectedTypes.add(t); + + } + else { + for (VariantContext.Type t : TYPES_TO_INCLUDE) + selectedTypes.add(t); + + } + + Set headerLines = new HashSet(); + headerLines.add(new VCFHeaderLine("source", "ValidationSiteSelector")); + vcfWriter.writeHeader(new VCFHeader(headerLines)); + + } + + + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + Collection vcs = tracker.getValues(variants, context.getLocation()); + + if ( vcs == null || vcs.size() == 0) { + return 0; + } + + + for (VariantContext vc : vcs) { + if (!selectedTypes.contains(vc.getType())) + continue; + + // skip if site isn't polymorphic and if user didn't request to ignore polymorphic status + if (!vc.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC) + continue; + + if (!INCLUDE_FILTERED_SITES && vc.filtersWereApplied() && vc.isFiltered()) + continue; + + + // do anything required by frequency selector before we select for samples + VariantContext subVC; + if (samples.isEmpty()) + subVC = vc; + else + subVC = sampleSelector.subsetSiteToSamples(vc); + frequencyModeSelector.logCurrentSiteData(vc, subVC, IGNORE_GENOTYPES, IGNORE_POLYMORPHIC); + } + return 1; + } + + @Override + public Integer reduceInit() { return 0; } + + @Override + public Integer reduce(Integer value, Integer sum) { return value + sum; } + + public void onTraversalDone(Integer result) { + logger.info("Outputting validation sites..."); + ArrayList selectedSites = frequencyModeSelector.selectValidationSites(numValidationSites); + + for (VariantContext vc : selectedSites) { + vcfWriter.add(vc); + } + logger.info(result + " records processed."); + + } + + private SampleSelector getSampleSelectorObject(SAMPLE_SELECTION_MODE sampleMode, TreeSet samples) { + SampleSelector sm; + switch ( sampleMode ) { + case POLY_BASED_ON_GL: + sm = new GLBasedSampleSelector(samples); + break; + case POLY_BASED_ON_GT: + sm = new GTBasedSampleSelector(samples); + break; + case NONE: + sm = new NullSampleSelector(samples); + break; + default: + throw new IllegalArgumentException("Unsupported Sample Selection Mode: " + sampleMode); + } + + return sm; + } + + private FrequencyModeSelector getFrequencyModeSelectorObject (AF_COMPUTATION_MODE freqMode, GenomeLocParser parser) { + FrequencyModeSelector fm; + + switch (freqMode) { + case KEEP_AF_SPECTRUM: + fm = new KeepAFSpectrumFrequencySelector(numFrequencyBins, parser); + break; + case UNIFORM: + fm = new UniformSamplingFrequencySelector(parser); + break; + default: throw new IllegalArgumentException("Unexpected Frequency Selection Mode: "+ freqMode); + + } + return fm; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java index c04ca8592..a86d4781f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqFeature.java @@ -6,8 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.ArrayList; -import java.util.List; +import java.util.*; /** * the ref seq feature @@ -111,6 +110,34 @@ public class RefSeqFeature implements Transcript, Feature { return overlapString.toString(); } + ArrayList exonInRefOrderCache = null; + + public Integer getSortedOverlapInteger(GenomeLoc position) { + int exonNo = -1; + ArrayList exonsInReferenceOrder = exonInRefOrderCache != null ? exonInRefOrderCache : new ArrayList(exons); + if ( exonInRefOrderCache == null ) { + Collections.sort(exonsInReferenceOrder); + } + exonInRefOrderCache = exonsInReferenceOrder; + for ( GenomeLoc exon : exonsInReferenceOrder ) { + if ( exon.overlapsP(position) ) { + return ++exonNo; + } + ++exonNo; + } + + return -1; + } + + public GenomeLoc getSortedExonLoc(int offset) { + ArrayList exonsInReferenceOrder = exonInRefOrderCache != null ? exonInRefOrderCache : new ArrayList(exons); + if ( exonInRefOrderCache == null ) { + Collections.sort(exonsInReferenceOrder); + } + exonInRefOrderCache = exonsInReferenceOrder; + return exonsInReferenceOrder.get(offset); + } + /** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */ public boolean overlapsP (GenomeLoc that) { return getLocation().overlapsP(that); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java index 2557d70bb..aa6d7d345 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java @@ -88,7 +88,6 @@ public class TableCodec implements ReferenceDependentFeatureCodec { try { boolean isFirst = true; while ((line = reader.readLine()) != null) { - System.out.println(line); if ( isFirst && ! line.startsWith(headerDelimiter) && ! line.startsWith(commentDelimiter)) { throw new UserException.MalformedFile("TableCodec file does not have a header"); }