Moving ValidationSiteSelector to validation package in public under my ownership. JunctionGenotyper added and modified several times, this commit is due to merging conflix fixes.

This commit is contained in:
Christopher Hartl 2011-12-19 10:57:28 -05:00
parent 3642a73c07
commit 69661da37d
12 changed files with 894 additions and 6 deletions

View File

@ -776,9 +776,9 @@ public class UnifiedGenotyperEngine {
*
* @return genotypes
*/
public GenotypesContext assignGenotypes(final VariantContext vc,
final boolean[] allelesToUse,
final List<Allele> newAlleles) {
public static GenotypesContext assignGenotypes(final VariantContext vc,
final boolean[] allelesToUse,
final List<Allele> newAlleles) {
// the no-called genotypes
final GenotypesContext GLs = vc.getGenotypes();

View File

@ -0,0 +1,47 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.ArrayList;
import java.util.HashMap;
public abstract class FrequencyModeSelector implements Cloneable{
protected GenomeLocParser parser;
protected FrequencyModeSelector(GenomeLocParser parser) {
this.parser = parser;
}
protected void logCurrentSiteData(VariantContext vc, VariantContext subVC) {
logCurrentSiteData(vc, subVC, false, false);
}
protected abstract void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC);
protected abstract ArrayList<VariantContext> selectValidationSites(int numValidationSites);
}

View File

@ -0,0 +1,43 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.TreeSet;
public class GLBasedSampleSelector extends SampleSelector {
public GLBasedSampleSelector(TreeSet<String> sm) {
super(sm);
}
public VariantContext subsetSiteToSamples(VariantContext vc) {
/* todo - Look at sample array, and create a new vc with samples for which GL's indicate they should be included.
For example, include all samples (and corresponding genotypes) whose GL's are such that argmax(GL) = HET or HOMVAR. */
throw new ReviewedStingException("GLBasedSampleSelector not implemented yet!");
//return true;
}
}

View File

@ -0,0 +1,51 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
public class GTBasedSampleSelector extends SampleSelector{
public GTBasedSampleSelector(TreeSet<String> sm) {
super(sm);
}
public VariantContext subsetSiteToSamples(VariantContext vc) {
// Super class already defined initialization which filled data structure "samples" with desired samples.
// We only need to check if current vc if polymorphic in that set of samples
if ( samples == null || samples.isEmpty() )
return vc;
return vc.subContextFromSamples(samples, vc.getAlleles());
}
}

View File

@ -0,0 +1,74 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import java.util.HashMap;
import java.util.List;
public class GenomeEvent implements Comparable {
final protected GenomeLoc loc;
/** A set of the alleles segregating in this context */
final protected List<Allele> alleles;
final protected Byte refBase;
// final protected HashMap<String, Object> attributes;
public GenomeEvent(GenomeLocParser parser, final String contig, final int start, final int stop, final List<Allele> alleles, HashMap<String, Object> attributes,
byte base) {
this.loc = parser.createGenomeLoc(contig, start, stop);
this.alleles = alleles;
this.refBase = base;
// this.attributes = attributes;
}
// Routine to compare two variant contexts (useful to sort collections of vc's).
// By default, we want to sort first by contig, then by start location
public GenomeLoc getGenomeLoc() {
return loc;
}
public int compareTo(final Object o) {
if (!(o instanceof GenomeEvent))
throw new ReviewedStingException("BUG: comparing variant context with non-VC object");
GenomeEvent otherEvent = (GenomeEvent)o;
return loc.compareTo(otherEvent.getGenomeLoc());
}
public VariantContext createVariantContextFromEvent() {
return new VariantContextBuilder("event", loc.getContig(), loc.getStart(), loc.getStop(), alleles)
.log10PError(0.0).referenceBaseForIndel(refBase).make();
}
}

View File

@ -0,0 +1,185 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
public class KeepAFSpectrumFrequencySelector extends FrequencyModeSelector {
private static final boolean DEBUG = true;
private int NUM_BINS = 20;
private int[] preSampleSelectionHistogram;
private int numTotalSites = 0;
private int[] postSampleSelectionHistogram;
private int numSampleSelectedSites = 0;
private ArrayList<GenomeEvent>[] binnedEventArray;
public KeepAFSpectrumFrequencySelector(int numBins, GenomeLocParser parser) {
super(parser);
NUM_BINS = numBins;
// initialize arrays dependent on NUM_BINS
binnedEventArray = new ArrayList[NUM_BINS];
for (int k=0; k < NUM_BINS; k++)
binnedEventArray[k] = new ArrayList<GenomeEvent>();
preSampleSelectionHistogram = new int[NUM_BINS];
postSampleSelectionHistogram = new int[NUM_BINS];
}
public void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) {
// this method is called for every variant of a selected type, regardless of whether it will be selectable or not
// get AC,AF,AN attributes from vc
HashMap<String, Object> attributes = new HashMap<String, Object>();
double[] afArray = null;
if (vc.hasGenotypes() && !IGNORE_GENOTYPES) {
// recompute AF,AC,AN based on genotypes:
// todo - - maybe too inefficient??
VariantContextUtils.calculateChromosomeCounts(vc, attributes, false);
afArray = new double[] {Double.valueOf((String)attributes.get(VCFConstants.ALLELE_FREQUENCY_KEY))};
} else {
// sites-only vc or we explicitly tell to ignore genotypes; we trust the AF field if present
if ( vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) {
String afo = vc.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY, null);
if (afo.contains(",")) {
String[] afs = afo.split(",");
afs[0] = afs[0].substring(1,afs[0].length());
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
afArray = new double[afs.length];
for (int k=0; k < afArray.length; k++)
afArray[k] = Double.valueOf(afs[k]);
}
else
afArray = new double[] {Double.valueOf(afo)};
}
}
if (afArray == null )
return;
double af0 = MathUtils.arrayMax(afArray);
int binIndex = (NUM_BINS-1) - (int) Math.floor(((1.0-af0)*NUM_BINS));
// deal with round-off issue: low-AC sites with large samples can have AF rounded down to 0.000
if (binIndex < 0)
binIndex = 0;
// System.out.format("Pre:%4.4f %d\n",af0, binIndex);
preSampleSelectionHistogram[binIndex]++;
numTotalSites++;
// now process VC subsetted to samples of interest
if (!subVC.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC)
return;
//System.out.format("Post:%4.4f %d\n",af0, binIndex);
postSampleSelectionHistogram[binIndex]++;
numSampleSelectedSites++;
// create bare-bones event and log in corresponding bin
// attributes contains AC,AF,AN pulled from original vc, and we keep them here and log in output file for bookkeeping purposes
GenomeEvent event = new GenomeEvent(parser, vc.getChr(), vc.getStart(), vc.getEnd(),vc.getAlleles(), attributes, vc.getReferenceBaseForIndel());
binnedEventArray[binIndex].add(event);
}
public ArrayList<VariantContext> selectValidationSites(int numValidationSites) {
// number of sites to choose at random for each frequency bin = #desired validation sites/# total sites * #sites in original bin
int[] sitesToChoosePerBin = new int[NUM_BINS];
int totalSites = 0;
for (int k=0; k < NUM_BINS; k++) {
int sites = (int)Math.round((double)numValidationSites * preSampleSelectionHistogram[k]/ (double)numTotalSites);
sitesToChoosePerBin[k] = sites;
totalSites += sites;
}
// deal with rounding artifacts
while (totalSites > numValidationSites) {
// take off one from randomly selected bin
int k= GenomeAnalysisEngine.getRandomGenerator().nextInt(NUM_BINS);
sitesToChoosePerBin[k]--;
totalSites--;
}
while (totalSites < numValidationSites) {
// take off one from randomly selected bin
int k= GenomeAnalysisEngine.getRandomGenerator().nextInt( NUM_BINS);
sitesToChoosePerBin[k]++;
totalSites++;
}
if (DEBUG) {
System.out.println("sitesToChoosePerBin:");
for (int k=0; k < NUM_BINS; k++)
System.out.format("%d ", sitesToChoosePerBin[k]);
System.out.println();
System.out.println("preSampleSelectionHistogram:");
for (int k=0; k < NUM_BINS; k++)
System.out.format("%d ", preSampleSelectionHistogram[k]);
System.out.println();
System.out.println("postSampleSelectionHistogram:");
for (int k=0; k < NUM_BINS; k++)
System.out.format("%d ", postSampleSelectionHistogram[k]);
System.out.println();
}
// take randomly sitesToChoosePerBin[k] elements from each bin
ArrayList<GenomeEvent> selectedEvents = new ArrayList<GenomeEvent>();
for (int k=0; k < NUM_BINS; k++) {
selectedEvents.addAll(MathUtils.randomSubset(binnedEventArray[k], sitesToChoosePerBin[k]));
}
Collections.sort(selectedEvents);
// now convert to VC
ArrayList<VariantContext> selectedSites = new ArrayList<VariantContext>();
for (GenomeEvent event : selectedEvents)
selectedSites.add(event.createVariantContextFromEvent());
return selectedSites;
}
}

View File

@ -0,0 +1,40 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.TreeSet;
public class NullSampleSelector extends SampleSelector{
public NullSampleSelector(TreeSet<String> sm) {
super(sm);
}
public VariantContext subsetSiteToSamples(VariantContext vc) {
return vc;
}
}

View File

@ -0,0 +1,40 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.TreeSet;
public abstract class SampleSelector implements Cloneable {
TreeSet<String> samples;
protected SampleSelector(TreeSet<String> sm) {
samples = new TreeSet<String>(sm);
}
protected abstract VariantContext subsetSiteToSamples(VariantContext vc);
}

View File

@ -0,0 +1,86 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
public class UniformSamplingFrequencySelector extends FrequencyModeSelector {
private ArrayList<GenomeEvent> binnedEventArray;
public UniformSamplingFrequencySelector(GenomeLocParser parser) {
super(parser);
binnedEventArray = new ArrayList<GenomeEvent>();
}
public void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) {
HashMap<String, Object> attributes = new HashMap<String, Object>();
if (vc.hasGenotypes() && !IGNORE_GENOTYPES) {
// recompute AF,AC,AN based on genotypes:
VariantContextUtils.calculateChromosomeCounts(vc, attributes, false);
if (!subVC.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC)
return;
} else {
if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) ) {
int ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0);
if (ac == 0) return; // site not polymorphic
}
else
return;
}
// create bare-bones event and log in corresponding bin
// attributes contains AC,AF,AN pulled from original vc, and we keep them here and log in output file for bookkeeping purposes
GenomeEvent event = new GenomeEvent(parser, vc.getChr(), vc.getStart(), vc.getEnd(),vc.getAlleles(), attributes, vc.getReferenceBaseForIndel());
binnedEventArray.add(event);
}
public ArrayList<VariantContext> selectValidationSites(int numValidationSites) {
// take randomly sitesToChoosePerBin[k] elements from each bin
ArrayList<GenomeEvent> selectedEvents = new ArrayList<GenomeEvent>();
selectedEvents.addAll(MathUtils.randomSubset(binnedEventArray, numValidationSites));
Collections.sort(selectedEvents);
// now convert to VC
ArrayList<VariantContext> selectedSites = new ArrayList<VariantContext>();
for (GenomeEvent event : selectedEvents)
selectedSites.add(event.createVariantContextFromEvent());
return selectedSites;
}
}

View File

@ -0,0 +1,296 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.io.File;
import java.util.*;
/**
* Randomly selects VCF records according to specified options.
*
* <p>
* ValidationSiteSelectorWalker is intended for use in experiments where we sample data randomly from a set of variants, for example
* in order to choose sites for a follow-up validation study.
*
* Sites are selected randomly but within certain restrictions. There are two main sources of restrictions
* a) Sample restrictions. A user can specify a set of samples, and we will only consider sites which are polymorphic within such given sample subset.
* These sample restrictions can be given as a set of individual samples, a text file (each line containing a sample name), or a regular expression.
* A user can additionally specify whether samples will be considered based on their genotypes (a non-reference genotype means that such sample is polymorphic in that variant,
* and hence that variant will be considered for inclusion in set), or based on their PLs.
* b) A user can additionally specify a sampling method based on allele frequency. Two sampling methods are currently supported.
* 1. Uniform sampling will just sample uniformly from variants polymorphic in selected samples.
* 2. Sampling based on Allele Frequency spectrum will ensure that output sites have the same AF distribution as the input set.
*
* User can additionally restrict output to a particular type of variant (SNP, Indel, etc.)
*
* <h2>Input</h2>
* <p>
* One or more variant sets to choose from.
* </p>
*
* <h2>Output</h2>
* <p>
* A sites-only VCF with the desired number of randomly selected sites.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T ValidationSiteSelectorWalker \
* --variant input1.vcf \
* --variant input2.vcf \
* -sn NA12878 \
* -o output.vcf \
* --numValidationSites 200 \
* -sampleMode POLY_BASED_ON_GT \
* -freqMode KEEP_AF_SPECTRUM
*
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T ValidationSiteSelectorWalker \
* --variant:foo input1.vcf \
* --variant:bar input2.vcf \
* --numValidationSites 200 \
* -sf samples.txt \
* -o output.vcf \
* -sampleMode POLY_BASED_ON_GT \
* -freqMode UNIFORM
* -selectType INDEL
* </pre>
*
*/
public class ValidationSiteSelectorWalker extends RodWalker<Integer, Integer> {
public enum AF_COMPUTATION_MODE {
KEEP_AF_SPECTRUM,
UNIFORM
}
public enum SAMPLE_SELECTION_MODE {
NONE,
POLY_BASED_ON_GT,
POLY_BASED_ON_GL
}
@Input(fullName="variant", shortName = "V", doc="Input VCF file, can be specified multiple times", required=true)
public List<RodBinding<VariantContext>> variants;
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
@Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false)
public Set<String> sampleNames = new HashSet<String>(0);
@Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false)
public Set<String> sampleExpressions ;
@Input(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false)
public Set<File> sampleFiles;
@Argument(fullName="sampleMode", shortName="sampleMode", doc="Sample selection mode", required=false)
private SAMPLE_SELECTION_MODE sampleMode = SAMPLE_SELECTION_MODE.NONE;
@Argument(fullName="numValidationSites", shortName="numSites", doc="Number of output validation sites", required=true)
private int numValidationSites;
@Argument(fullName="includeFilteredSites", shortName="ifs", doc="If true, will include filtered sites in set to choose variants from", required=false)
private boolean INCLUDE_FILTERED_SITES = false;
@Argument(fullName="ignoreGenotypes", shortName="ignoreGenotypes", doc="If true, will ignore genotypes in VCF, will take AC,AF from annotations and will make no sample selection", required=false)
private boolean IGNORE_GENOTYPES = false;
@Argument(fullName="ignorePolymorphicStatus", shortName="ignorePolymorphicStatus", doc="If true, will ignore polymorphic status in VCF, and will take VCF record directly without pre-selection", required=false)
private boolean IGNORE_POLYMORPHIC = false;
@Hidden
@Argument(fullName="numFrequencyBins", shortName="numBins", doc="Number of frequency bins if we're to match AF distribution", required=false)
private int numFrequencyBins = 20;
/**
* This argument selects allele frequency selection mode:
* KEEP_AF_SPECTRUM will choose variants so that the resulting allele frequency spectrum matches as closely as possible the input set
* UNIFORM will choose variants uniformly without regard to their allele frequency.
*
*/
@Argument(fullName="frequencySelectionMode", shortName="freqMode", doc="Allele Frequency selection mode", required=false)
private AF_COMPUTATION_MODE freqMode = AF_COMPUTATION_MODE.KEEP_AF_SPECTRUM;
/**
* This argument selects particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria.
* When specified one or more times, a particular type of variant is selected.
*
*/
@Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false)
private List<VariantContext.Type> TYPES_TO_INCLUDE = new ArrayList<VariantContext.Type>();
private TreeSet<String> samples = new TreeSet<String>();
SampleSelector sampleSelector = null;
FrequencyModeSelector frequencyModeSelector = null;
private ArrayList<VariantContext.Type> selectedTypes = new ArrayList<VariantContext.Type>();
public void initialize() {
// Get list of samples to include in the output
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit());
TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));
Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles);
Collection<String> samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions);
// first, add any requested samples
samples.addAll(samplesFromFile);
samples.addAll(samplesFromExpressions);
samples.addAll(sampleNames);
// if none were requested, we want all of them
if ( samples.isEmpty() ) {
samples.addAll(vcfSamples);
}
sampleSelector = getSampleSelectorObject(sampleMode, samples);
// initialize frequency mode selector
frequencyModeSelector = getFrequencyModeSelectorObject(freqMode, getToolkit().getGenomeLocParser());
// if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
if (TYPES_TO_INCLUDE.isEmpty()) {
for (VariantContext.Type t : VariantContext.Type.values())
selectedTypes.add(t);
}
else {
for (VariantContext.Type t : TYPES_TO_INCLUDE)
selectedTypes.add(t);
}
Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
headerLines.add(new VCFHeaderLine("source", "ValidationSiteSelector"));
vcfWriter.writeHeader(new VCFHeader(headerLines));
}
@Override
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null )
return 0;
Collection<VariantContext> vcs = tracker.getValues(variants, context.getLocation());
if ( vcs == null || vcs.size() == 0) {
return 0;
}
for (VariantContext vc : vcs) {
if (!selectedTypes.contains(vc.getType()))
continue;
// skip if site isn't polymorphic and if user didn't request to ignore polymorphic status
if (!vc.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC)
continue;
if (!INCLUDE_FILTERED_SITES && vc.filtersWereApplied() && vc.isFiltered())
continue;
// do anything required by frequency selector before we select for samples
VariantContext subVC;
if (samples.isEmpty())
subVC = vc;
else
subVC = sampleSelector.subsetSiteToSamples(vc);
frequencyModeSelector.logCurrentSiteData(vc, subVC, IGNORE_GENOTYPES, IGNORE_POLYMORPHIC);
}
return 1;
}
@Override
public Integer reduceInit() { return 0; }
@Override
public Integer reduce(Integer value, Integer sum) { return value + sum; }
public void onTraversalDone(Integer result) {
logger.info("Outputting validation sites...");
ArrayList<VariantContext> selectedSites = frequencyModeSelector.selectValidationSites(numValidationSites);
for (VariantContext vc : selectedSites) {
vcfWriter.add(vc);
}
logger.info(result + " records processed.");
}
private SampleSelector getSampleSelectorObject(SAMPLE_SELECTION_MODE sampleMode, TreeSet<String> samples) {
SampleSelector sm;
switch ( sampleMode ) {
case POLY_BASED_ON_GL:
sm = new GLBasedSampleSelector(samples);
break;
case POLY_BASED_ON_GT:
sm = new GTBasedSampleSelector(samples);
break;
case NONE:
sm = new NullSampleSelector(samples);
break;
default:
throw new IllegalArgumentException("Unsupported Sample Selection Mode: " + sampleMode);
}
return sm;
}
private FrequencyModeSelector getFrequencyModeSelectorObject (AF_COMPUTATION_MODE freqMode, GenomeLocParser parser) {
FrequencyModeSelector fm;
switch (freqMode) {
case KEEP_AF_SPECTRUM:
fm = new KeepAFSpectrumFrequencySelector(numFrequencyBins, parser);
break;
case UNIFORM:
fm = new UniformSamplingFrequencySelector(parser);
break;
default: throw new IllegalArgumentException("Unexpected Frequency Selection Mode: "+ freqMode);
}
return fm;
}
}

View File

@ -6,8 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList;
import java.util.List;
import java.util.*;
/**
* the ref seq feature
@ -111,6 +110,34 @@ public class RefSeqFeature implements Transcript, Feature {
return overlapString.toString();
}
ArrayList<GenomeLoc> exonInRefOrderCache = null;
public Integer getSortedOverlapInteger(GenomeLoc position) {
int exonNo = -1;
ArrayList<GenomeLoc> exonsInReferenceOrder = exonInRefOrderCache != null ? exonInRefOrderCache : new ArrayList<GenomeLoc>(exons);
if ( exonInRefOrderCache == null ) {
Collections.sort(exonsInReferenceOrder);
}
exonInRefOrderCache = exonsInReferenceOrder;
for ( GenomeLoc exon : exonsInReferenceOrder ) {
if ( exon.overlapsP(position) ) {
return ++exonNo;
}
++exonNo;
}
return -1;
}
public GenomeLoc getSortedExonLoc(int offset) {
ArrayList<GenomeLoc> exonsInReferenceOrder = exonInRefOrderCache != null ? exonInRefOrderCache : new ArrayList<GenomeLoc>(exons);
if ( exonInRefOrderCache == null ) {
Collections.sort(exonsInReferenceOrder);
}
exonInRefOrderCache = exonsInReferenceOrder;
return exonsInReferenceOrder.get(offset);
}
/** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */
public boolean overlapsP (GenomeLoc that) {
return getLocation().overlapsP(that);

View File

@ -88,7 +88,6 @@ public class TableCodec implements ReferenceDependentFeatureCodec {
try {
boolean isFirst = true;
while ((line = reader.readLine()) != null) {
System.out.println(line);
if ( isFirst && ! line.startsWith(headerDelimiter) && ! line.startsWith(commentDelimiter)) {
throw new UserException.MalformedFile("TableCodec file does not have a header");
}