Moving ValidationSiteSelector to the validation package in public under my ownership. JunctionGenotyper was added and modified several times; this commit is due to merge-conflict fixes.
This commit is contained in:
parent
3642a73c07
commit
69661da37d
|
|
@ -776,9 +776,9 @@ public class UnifiedGenotyperEngine {
|
||||||
*
|
*
|
||||||
* @return genotypes
|
* @return genotypes
|
||||||
*/
|
*/
|
||||||
public GenotypesContext assignGenotypes(final VariantContext vc,
|
public static GenotypesContext assignGenotypes(final VariantContext vc,
|
||||||
final boolean[] allelesToUse,
|
final boolean[] allelesToUse,
|
||||||
final List<Allele> newAlleles) {
|
final List<Allele> newAlleles) {
|
||||||
|
|
||||||
// the no-called genotypes
|
// the no-called genotypes
|
||||||
final GenotypesContext GLs = vc.getGenotypes();
|
final GenotypesContext GLs = vc.getGenotypes();
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,47 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
public abstract class FrequencyModeSelector implements Cloneable{
|
||||||
|
|
||||||
|
protected GenomeLocParser parser;
|
||||||
|
|
||||||
|
protected FrequencyModeSelector(GenomeLocParser parser) {
|
||||||
|
this.parser = parser;
|
||||||
|
}
|
||||||
|
protected void logCurrentSiteData(VariantContext vc, VariantContext subVC) {
|
||||||
|
logCurrentSiteData(vc, subVC, false, false);
|
||||||
|
}
|
||||||
|
protected abstract void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC);
|
||||||
|
protected abstract ArrayList<VariantContext> selectValidationSites(int numValidationSites);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,43 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
||||||
|
import java.util.TreeSet;
|
||||||
|
|
||||||
|
|
||||||
|
public class GLBasedSampleSelector extends SampleSelector {
|
||||||
|
public GLBasedSampleSelector(TreeSet<String> sm) {
|
||||||
|
super(sm);
|
||||||
|
}
|
||||||
|
|
||||||
|
public VariantContext subsetSiteToSamples(VariantContext vc) {
|
||||||
|
/* todo - Look at sample array, and create a new vc with samples for which GL's indicate they should be included.
|
||||||
|
For example, include all samples (and corresponding genotypes) whose GL's are such that argmax(GL) = HET or HOMVAR. */
|
||||||
|
throw new ReviewedStingException("GLBasedSampleSelector not implemented yet!");
|
||||||
|
//return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,51 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
|
||||||
|
|
||||||
|
public class GTBasedSampleSelector extends SampleSelector{
|
||||||
|
public GTBasedSampleSelector(TreeSet<String> sm) {
|
||||||
|
super(sm);
|
||||||
|
}
|
||||||
|
|
||||||
|
public VariantContext subsetSiteToSamples(VariantContext vc) {
|
||||||
|
// Super class already defined initialization which filled data structure "samples" with desired samples.
|
||||||
|
// We only need to check if current vc if polymorphic in that set of samples
|
||||||
|
|
||||||
|
if ( samples == null || samples.isEmpty() )
|
||||||
|
return vc;
|
||||||
|
|
||||||
|
return vc.subContextFromSamples(samples, vc.getAlleles());
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,74 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
|
public class GenomeEvent implements Comparable {
|
||||||
|
final protected GenomeLoc loc;
|
||||||
|
/** A set of the alleles segregating in this context */
|
||||||
|
final protected List<Allele> alleles;
|
||||||
|
final protected Byte refBase;
|
||||||
|
// final protected HashMap<String, Object> attributes;
|
||||||
|
|
||||||
|
public GenomeEvent(GenomeLocParser parser, final String contig, final int start, final int stop, final List<Allele> alleles, HashMap<String, Object> attributes,
|
||||||
|
byte base) {
|
||||||
|
this.loc = parser.createGenomeLoc(contig, start, stop);
|
||||||
|
this.alleles = alleles;
|
||||||
|
this.refBase = base;
|
||||||
|
// this.attributes = attributes;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Routine to compare two variant contexts (useful to sort collections of vc's).
|
||||||
|
// By default, we want to sort first by contig, then by start location
|
||||||
|
|
||||||
|
public GenomeLoc getGenomeLoc() {
|
||||||
|
return loc;
|
||||||
|
}
|
||||||
|
public int compareTo(final Object o) {
|
||||||
|
if (!(o instanceof GenomeEvent))
|
||||||
|
throw new ReviewedStingException("BUG: comparing variant context with non-VC object");
|
||||||
|
|
||||||
|
GenomeEvent otherEvent = (GenomeEvent)o;
|
||||||
|
|
||||||
|
return loc.compareTo(otherEvent.getGenomeLoc());
|
||||||
|
}
|
||||||
|
|
||||||
|
public VariantContext createVariantContextFromEvent() {
|
||||||
|
return new VariantContextBuilder("event", loc.getContig(), loc.getStart(), loc.getStop(), alleles)
|
||||||
|
.log10PError(0.0).referenceBaseForIndel(refBase).make();
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,185 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
|
||||||
|
public class KeepAFSpectrumFrequencySelector extends FrequencyModeSelector {
|
||||||
|
|
||||||
|
private static final boolean DEBUG = true;
|
||||||
|
|
||||||
|
private int NUM_BINS = 20;
|
||||||
|
|
||||||
|
private int[] preSampleSelectionHistogram;
|
||||||
|
private int numTotalSites = 0;
|
||||||
|
private int[] postSampleSelectionHistogram;
|
||||||
|
private int numSampleSelectedSites = 0;
|
||||||
|
private ArrayList<GenomeEvent>[] binnedEventArray;
|
||||||
|
|
||||||
|
public KeepAFSpectrumFrequencySelector(int numBins, GenomeLocParser parser) {
|
||||||
|
super(parser);
|
||||||
|
NUM_BINS = numBins;
|
||||||
|
// initialize arrays dependent on NUM_BINS
|
||||||
|
binnedEventArray = new ArrayList[NUM_BINS];
|
||||||
|
|
||||||
|
for (int k=0; k < NUM_BINS; k++)
|
||||||
|
binnedEventArray[k] = new ArrayList<GenomeEvent>();
|
||||||
|
|
||||||
|
preSampleSelectionHistogram = new int[NUM_BINS];
|
||||||
|
postSampleSelectionHistogram = new int[NUM_BINS];
|
||||||
|
}
|
||||||
|
|
||||||
|
public void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) {
|
||||||
|
|
||||||
|
// this method is called for every variant of a selected type, regardless of whether it will be selectable or not
|
||||||
|
// get AC,AF,AN attributes from vc
|
||||||
|
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||||
|
double[] afArray = null;
|
||||||
|
|
||||||
|
if (vc.hasGenotypes() && !IGNORE_GENOTYPES) {
|
||||||
|
// recompute AF,AC,AN based on genotypes:
|
||||||
|
// todo - - maybe too inefficient??
|
||||||
|
VariantContextUtils.calculateChromosomeCounts(vc, attributes, false);
|
||||||
|
afArray = new double[] {Double.valueOf((String)attributes.get(VCFConstants.ALLELE_FREQUENCY_KEY))};
|
||||||
|
} else {
|
||||||
|
// sites-only vc or we explicitly tell to ignore genotypes; we trust the AF field if present
|
||||||
|
if ( vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) {
|
||||||
|
String afo = vc.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY, null);
|
||||||
|
|
||||||
|
if (afo.contains(",")) {
|
||||||
|
String[] afs = afo.split(",");
|
||||||
|
afs[0] = afs[0].substring(1,afs[0].length());
|
||||||
|
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
|
||||||
|
|
||||||
|
afArray = new double[afs.length];
|
||||||
|
|
||||||
|
for (int k=0; k < afArray.length; k++)
|
||||||
|
afArray[k] = Double.valueOf(afs[k]);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
afArray = new double[] {Double.valueOf(afo)};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (afArray == null )
|
||||||
|
return;
|
||||||
|
|
||||||
|
double af0 = MathUtils.arrayMax(afArray);
|
||||||
|
|
||||||
|
int binIndex = (NUM_BINS-1) - (int) Math.floor(((1.0-af0)*NUM_BINS));
|
||||||
|
// deal with round-off issue: low-AC sites with large samples can have AF rounded down to 0.000
|
||||||
|
if (binIndex < 0)
|
||||||
|
binIndex = 0;
|
||||||
|
// System.out.format("Pre:%4.4f %d\n",af0, binIndex);
|
||||||
|
preSampleSelectionHistogram[binIndex]++;
|
||||||
|
numTotalSites++;
|
||||||
|
|
||||||
|
// now process VC subsetted to samples of interest
|
||||||
|
if (!subVC.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC)
|
||||||
|
return;
|
||||||
|
|
||||||
|
//System.out.format("Post:%4.4f %d\n",af0, binIndex);
|
||||||
|
postSampleSelectionHistogram[binIndex]++;
|
||||||
|
numSampleSelectedSites++;
|
||||||
|
|
||||||
|
// create bare-bones event and log in corresponding bin
|
||||||
|
// attributes contains AC,AF,AN pulled from original vc, and we keep them here and log in output file for bookkeeping purposes
|
||||||
|
GenomeEvent event = new GenomeEvent(parser, vc.getChr(), vc.getStart(), vc.getEnd(),vc.getAlleles(), attributes, vc.getReferenceBaseForIndel());
|
||||||
|
|
||||||
|
binnedEventArray[binIndex].add(event);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public ArrayList<VariantContext> selectValidationSites(int numValidationSites) {
|
||||||
|
// number of sites to choose at random for each frequency bin = #desired validation sites/# total sites * #sites in original bin
|
||||||
|
int[] sitesToChoosePerBin = new int[NUM_BINS];
|
||||||
|
int totalSites = 0;
|
||||||
|
for (int k=0; k < NUM_BINS; k++) {
|
||||||
|
int sites = (int)Math.round((double)numValidationSites * preSampleSelectionHistogram[k]/ (double)numTotalSites);
|
||||||
|
sitesToChoosePerBin[k] = sites;
|
||||||
|
totalSites += sites;
|
||||||
|
}
|
||||||
|
|
||||||
|
// deal with rounding artifacts
|
||||||
|
while (totalSites > numValidationSites) {
|
||||||
|
// take off one from randomly selected bin
|
||||||
|
int k= GenomeAnalysisEngine.getRandomGenerator().nextInt(NUM_BINS);
|
||||||
|
sitesToChoosePerBin[k]--;
|
||||||
|
totalSites--;
|
||||||
|
}
|
||||||
|
while (totalSites < numValidationSites) {
|
||||||
|
// take off one from randomly selected bin
|
||||||
|
int k= GenomeAnalysisEngine.getRandomGenerator().nextInt( NUM_BINS);
|
||||||
|
sitesToChoosePerBin[k]++;
|
||||||
|
totalSites++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (DEBUG) {
|
||||||
|
System.out.println("sitesToChoosePerBin:");
|
||||||
|
for (int k=0; k < NUM_BINS; k++)
|
||||||
|
System.out.format("%d ", sitesToChoosePerBin[k]);
|
||||||
|
System.out.println();
|
||||||
|
|
||||||
|
System.out.println("preSampleSelectionHistogram:");
|
||||||
|
for (int k=0; k < NUM_BINS; k++)
|
||||||
|
System.out.format("%d ", preSampleSelectionHistogram[k]);
|
||||||
|
System.out.println();
|
||||||
|
|
||||||
|
System.out.println("postSampleSelectionHistogram:");
|
||||||
|
for (int k=0; k < NUM_BINS; k++)
|
||||||
|
System.out.format("%d ", postSampleSelectionHistogram[k]);
|
||||||
|
System.out.println();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// take randomly sitesToChoosePerBin[k] elements from each bin
|
||||||
|
ArrayList<GenomeEvent> selectedEvents = new ArrayList<GenomeEvent>();
|
||||||
|
|
||||||
|
for (int k=0; k < NUM_BINS; k++) {
|
||||||
|
selectedEvents.addAll(MathUtils.randomSubset(binnedEventArray[k], sitesToChoosePerBin[k]));
|
||||||
|
}
|
||||||
|
|
||||||
|
Collections.sort(selectedEvents);
|
||||||
|
|
||||||
|
// now convert to VC
|
||||||
|
ArrayList<VariantContext> selectedSites = new ArrayList<VariantContext>();
|
||||||
|
for (GenomeEvent event : selectedEvents)
|
||||||
|
selectedSites.add(event.createVariantContextFromEvent());
|
||||||
|
|
||||||
|
return selectedSites;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,40 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
|
||||||
|
|
||||||
|
public class NullSampleSelector extends SampleSelector{
|
||||||
|
public NullSampleSelector(TreeSet<String> sm) {
|
||||||
|
super(sm);
|
||||||
|
}
|
||||||
|
|
||||||
|
public VariantContext subsetSiteToSamples(VariantContext vc) {
|
||||||
|
return vc;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,40 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
|
||||||
|
|
||||||
|
public abstract class SampleSelector implements Cloneable {
|
||||||
|
|
||||||
|
TreeSet<String> samples;
|
||||||
|
protected SampleSelector(TreeSet<String> sm) {
|
||||||
|
samples = new TreeSet<String>(sm);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected abstract VariantContext subsetSiteToSamples(VariantContext vc);
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,86 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.MathUtils;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
public class UniformSamplingFrequencySelector extends FrequencyModeSelector {
|
||||||
|
private ArrayList<GenomeEvent> binnedEventArray;
|
||||||
|
|
||||||
|
public UniformSamplingFrequencySelector(GenomeLocParser parser) {
|
||||||
|
super(parser);
|
||||||
|
binnedEventArray = new ArrayList<GenomeEvent>();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void logCurrentSiteData(VariantContext vc, VariantContext subVC, boolean IGNORE_GENOTYPES, boolean IGNORE_POLYMORPHIC) {
|
||||||
|
HashMap<String, Object> attributes = new HashMap<String, Object>();
|
||||||
|
|
||||||
|
|
||||||
|
if (vc.hasGenotypes() && !IGNORE_GENOTYPES) {
|
||||||
|
// recompute AF,AC,AN based on genotypes:
|
||||||
|
VariantContextUtils.calculateChromosomeCounts(vc, attributes, false);
|
||||||
|
if (!subVC.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC)
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) ) {
|
||||||
|
int ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0);
|
||||||
|
if (ac == 0) return; // site not polymorphic
|
||||||
|
}
|
||||||
|
else
|
||||||
|
return;
|
||||||
|
|
||||||
|
}
|
||||||
|
// create bare-bones event and log in corresponding bin
|
||||||
|
// attributes contains AC,AF,AN pulled from original vc, and we keep them here and log in output file for bookkeeping purposes
|
||||||
|
GenomeEvent event = new GenomeEvent(parser, vc.getChr(), vc.getStart(), vc.getEnd(),vc.getAlleles(), attributes, vc.getReferenceBaseForIndel());
|
||||||
|
binnedEventArray.add(event);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public ArrayList<VariantContext> selectValidationSites(int numValidationSites) {
|
||||||
|
|
||||||
|
// take randomly sitesToChoosePerBin[k] elements from each bin
|
||||||
|
ArrayList<GenomeEvent> selectedEvents = new ArrayList<GenomeEvent>();
|
||||||
|
|
||||||
|
selectedEvents.addAll(MathUtils.randomSubset(binnedEventArray, numValidationSites));
|
||||||
|
|
||||||
|
Collections.sort(selectedEvents);
|
||||||
|
|
||||||
|
// now convert to VC
|
||||||
|
ArrayList<VariantContext> selectedSites = new ArrayList<VariantContext>();
|
||||||
|
for (GenomeEvent event : selectedEvents)
|
||||||
|
selectedSites.add(event.createVariantContextFromEvent());
|
||||||
|
|
||||||
|
return selectedSites;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,296 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.validation.validationsiteselector;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.commandline.*;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.SampleUtils;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Randomly selects VCF records according to specified options.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* ValidationSiteSelectorWalker is intended for use in experiments where we sample data randomly from a set of variants, for example
|
||||||
|
* in order to choose sites for a follow-up validation study.
|
||||||
|
*
|
||||||
|
* Sites are selected randomly but within certain restrictions. There are two main sources of restrictions:
|
||||||
|
* a) Sample restrictions. A user can specify a set of samples, and only sites that are polymorphic within that sample subset will be considered.
|
||||||
|
* These sample restrictions can be given as a set of individual samples, a text file (each line containing a sample name), or a regular expression.
|
||||||
|
* A user can additionally specify whether samples will be considered based on their genotypes (a non-reference genotype means that such sample is polymorphic in that variant,
|
||||||
|
* and hence that variant will be considered for inclusion in set), or based on their PLs.
|
||||||
|
* b) A user can additionally specify a sampling method based on allele frequency. Two sampling methods are currently supported.
|
||||||
|
* 1. Uniform sampling will just sample uniformly from variants polymorphic in selected samples.
|
||||||
|
* 2. Sampling based on Allele Frequency spectrum will ensure that output sites have the same AF distribution as the input set.
|
||||||
|
*
|
||||||
|
* A user can additionally restrict output to a particular type of variant (SNP, Indel, etc.).
|
||||||
|
*
|
||||||
|
* <h2>Input</h2>
|
||||||
|
* <p>
|
||||||
|
* One or more variant sets to choose from.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Output</h2>
|
||||||
|
* <p>
|
||||||
|
* A sites-only VCF with the desired number of randomly selected sites.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <h2>Examples</h2>
|
||||||
|
* <pre>
|
||||||
|
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||||
|
* -R ref.fasta \
|
||||||
|
* -T ValidationSiteSelectorWalker \
|
||||||
|
* --variant input1.vcf \
|
||||||
|
* --variant input2.vcf \
|
||||||
|
* -sn NA12878 \
|
||||||
|
* -o output.vcf \
|
||||||
|
* --numValidationSites 200 \
|
||||||
|
* -sampleMode POLY_BASED_ON_GT \
|
||||||
|
* -freqMode KEEP_AF_SPECTRUM
|
||||||
|
*
|
||||||
|
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||||
|
* -R ref.fasta \
|
||||||
|
* -T ValidationSiteSelectorWalker \
|
||||||
|
* --variant:foo input1.vcf \
|
||||||
|
* --variant:bar input2.vcf \
|
||||||
|
* --numValidationSites 200 \
|
||||||
|
* -sf samples.txt \
|
||||||
|
* -o output.vcf \
|
||||||
|
* -sampleMode POLY_BASED_ON_GT \
|
||||||
|
* -freqMode UNIFORM \
|
||||||
|
* -selectType INDEL
|
||||||
|
* </pre>
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class ValidationSiteSelectorWalker extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
|
public enum AF_COMPUTATION_MODE {
|
||||||
|
KEEP_AF_SPECTRUM,
|
||||||
|
UNIFORM
|
||||||
|
}
|
||||||
|
|
||||||
|
public enum SAMPLE_SELECTION_MODE {
|
||||||
|
NONE,
|
||||||
|
POLY_BASED_ON_GT,
|
||||||
|
POLY_BASED_ON_GL
|
||||||
|
}
|
||||||
|
|
||||||
|
@Input(fullName="variant", shortName = "V", doc="Input VCF file, can be specified multiple times", required=true)
|
||||||
|
public List<RodBinding<VariantContext>> variants;
|
||||||
|
|
||||||
|
@Output(doc="File to which variants should be written",required=true)
|
||||||
|
protected VCFWriter vcfWriter = null;
|
||||||
|
|
||||||
|
@Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false)
|
||||||
|
public Set<String> sampleNames = new HashSet<String>(0);
|
||||||
|
|
||||||
|
@Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false)
|
||||||
|
public Set<String> sampleExpressions ;
|
||||||
|
|
||||||
|
@Input(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false)
|
||||||
|
public Set<File> sampleFiles;
|
||||||
|
|
||||||
|
@Argument(fullName="sampleMode", shortName="sampleMode", doc="Sample selection mode", required=false)
|
||||||
|
private SAMPLE_SELECTION_MODE sampleMode = SAMPLE_SELECTION_MODE.NONE;
|
||||||
|
|
||||||
|
@Argument(fullName="numValidationSites", shortName="numSites", doc="Number of output validation sites", required=true)
|
||||||
|
private int numValidationSites;
|
||||||
|
|
||||||
|
@Argument(fullName="includeFilteredSites", shortName="ifs", doc="If true, will include filtered sites in set to choose variants from", required=false)
|
||||||
|
private boolean INCLUDE_FILTERED_SITES = false;
|
||||||
|
|
||||||
|
@Argument(fullName="ignoreGenotypes", shortName="ignoreGenotypes", doc="If true, will ignore genotypes in VCF, will take AC,AF from annotations and will make no sample selection", required=false)
|
||||||
|
private boolean IGNORE_GENOTYPES = false;
|
||||||
|
|
||||||
|
@Argument(fullName="ignorePolymorphicStatus", shortName="ignorePolymorphicStatus", doc="If true, will ignore polymorphic status in VCF, and will take VCF record directly without pre-selection", required=false)
|
||||||
|
private boolean IGNORE_POLYMORPHIC = false;
|
||||||
|
|
||||||
|
@Hidden
|
||||||
|
@Argument(fullName="numFrequencyBins", shortName="numBins", doc="Number of frequency bins if we're to match AF distribution", required=false)
|
||||||
|
private int numFrequencyBins = 20;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This argument selects allele frequency selection mode:
|
||||||
|
* KEEP_AF_SPECTRUM will choose variants so that the resulting allele frequency spectrum matches as closely as possible the input set
|
||||||
|
* UNIFORM will choose variants uniformly without regard to their allele frequency.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
@Argument(fullName="frequencySelectionMode", shortName="freqMode", doc="Allele Frequency selection mode", required=false)
|
||||||
|
private AF_COMPUTATION_MODE freqMode = AF_COMPUTATION_MODE.KEEP_AF_SPECTRUM;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This argument selects particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria.
|
||||||
|
* When specified one or more times, a particular type of variant is selected.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
@Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false)
|
||||||
|
private List<VariantContext.Type> TYPES_TO_INCLUDE = new ArrayList<VariantContext.Type>();
|
||||||
|
|
||||||
|
|
||||||
|
private TreeSet<String> samples = new TreeSet<String>();
|
||||||
|
SampleSelector sampleSelector = null;
|
||||||
|
FrequencyModeSelector frequencyModeSelector = null;
|
||||||
|
private ArrayList<VariantContext.Type> selectedTypes = new ArrayList<VariantContext.Type>();
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
// Get list of samples to include in the output
|
||||||
|
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit());
|
||||||
|
TreeSet<String> vcfSamples = new TreeSet<String>(SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE));
|
||||||
|
|
||||||
|
Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles);
|
||||||
|
Collection<String> samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions);
|
||||||
|
|
||||||
|
// first, add any requested samples
|
||||||
|
samples.addAll(samplesFromFile);
|
||||||
|
samples.addAll(samplesFromExpressions);
|
||||||
|
samples.addAll(sampleNames);
|
||||||
|
|
||||||
|
// if none were requested, we want all of them
|
||||||
|
if ( samples.isEmpty() ) {
|
||||||
|
samples.addAll(vcfSamples);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
sampleSelector = getSampleSelectorObject(sampleMode, samples);
|
||||||
|
|
||||||
|
// initialize frequency mode selector
|
||||||
|
frequencyModeSelector = getFrequencyModeSelectorObject(freqMode, getToolkit().getGenomeLocParser());
|
||||||
|
|
||||||
|
// if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
|
||||||
|
if (TYPES_TO_INCLUDE.isEmpty()) {
|
||||||
|
|
||||||
|
for (VariantContext.Type t : VariantContext.Type.values())
|
||||||
|
selectedTypes.add(t);
|
||||||
|
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
for (VariantContext.Type t : TYPES_TO_INCLUDE)
|
||||||
|
selectedTypes.add(t);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
|
||||||
|
headerLines.add(new VCFHeaderLine("source", "ValidationSiteSelector"));
|
||||||
|
vcfWriter.writeHeader(new VCFHeader(headerLines));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
if ( tracker == null )
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
Collection<VariantContext> vcs = tracker.getValues(variants, context.getLocation());
|
||||||
|
|
||||||
|
if ( vcs == null || vcs.size() == 0) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
for (VariantContext vc : vcs) {
|
||||||
|
if (!selectedTypes.contains(vc.getType()))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// skip if site isn't polymorphic and if user didn't request to ignore polymorphic status
|
||||||
|
if (!vc.isPolymorphicInSamples() && !IGNORE_POLYMORPHIC)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (!INCLUDE_FILTERED_SITES && vc.filtersWereApplied() && vc.isFiltered())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
|
||||||
|
// do anything required by frequency selector before we select for samples
|
||||||
|
VariantContext subVC;
|
||||||
|
if (samples.isEmpty())
|
||||||
|
subVC = vc;
|
||||||
|
else
|
||||||
|
subVC = sampleSelector.subsetSiteToSamples(vc);
|
||||||
|
frequencyModeSelector.logCurrentSiteData(vc, subVC, IGNORE_GENOTYPES, IGNORE_POLYMORPHIC);
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Integer reduceInit() { return 0; }
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Integer reduce(Integer value, Integer sum) { return value + sum; }
|
||||||
|
|
||||||
|
public void onTraversalDone(Integer result) {
|
||||||
|
logger.info("Outputting validation sites...");
|
||||||
|
ArrayList<VariantContext> selectedSites = frequencyModeSelector.selectValidationSites(numValidationSites);
|
||||||
|
|
||||||
|
for (VariantContext vc : selectedSites) {
|
||||||
|
vcfWriter.add(vc);
|
||||||
|
}
|
||||||
|
logger.info(result + " records processed.");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private SampleSelector getSampleSelectorObject(SAMPLE_SELECTION_MODE sampleMode, TreeSet<String> samples) {
|
||||||
|
SampleSelector sm;
|
||||||
|
switch ( sampleMode ) {
|
||||||
|
case POLY_BASED_ON_GL:
|
||||||
|
sm = new GLBasedSampleSelector(samples);
|
||||||
|
break;
|
||||||
|
case POLY_BASED_ON_GT:
|
||||||
|
sm = new GTBasedSampleSelector(samples);
|
||||||
|
break;
|
||||||
|
case NONE:
|
||||||
|
sm = new NullSampleSelector(samples);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw new IllegalArgumentException("Unsupported Sample Selection Mode: " + sampleMode);
|
||||||
|
}
|
||||||
|
|
||||||
|
return sm;
|
||||||
|
}
|
||||||
|
|
||||||
|
private FrequencyModeSelector getFrequencyModeSelectorObject (AF_COMPUTATION_MODE freqMode, GenomeLocParser parser) {
|
||||||
|
FrequencyModeSelector fm;
|
||||||
|
|
||||||
|
switch (freqMode) {
|
||||||
|
case KEEP_AF_SPECTRUM:
|
||||||
|
fm = new KeepAFSpectrumFrequencySelector(numFrequencyBins, parser);
|
||||||
|
break;
|
||||||
|
case UNIFORM:
|
||||||
|
fm = new UniformSamplingFrequencySelector(parser);
|
||||||
|
break;
|
||||||
|
default: throw new IllegalArgumentException("Unexpected Frequency Selection Mode: "+ freqMode);
|
||||||
|
|
||||||
|
}
|
||||||
|
return fm;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -6,8 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* the ref seq feature
|
* the ref seq feature
|
||||||
|
|
@ -111,6 +110,34 @@ public class RefSeqFeature implements Transcript, Feature {
|
||||||
return overlapString.toString();
|
return overlapString.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Copy of exons ordered by Collections.sort; null until getSortedOverlapInteger or getSortedExonLoc first builds it.
ArrayList<GenomeLoc> exonInRefOrderCache = null;
|
||||||
|
|
||||||
|
public Integer getSortedOverlapInteger(GenomeLoc position) {
|
||||||
|
int exonNo = -1;
|
||||||
|
ArrayList<GenomeLoc> exonsInReferenceOrder = exonInRefOrderCache != null ? exonInRefOrderCache : new ArrayList<GenomeLoc>(exons);
|
||||||
|
if ( exonInRefOrderCache == null ) {
|
||||||
|
Collections.sort(exonsInReferenceOrder);
|
||||||
|
}
|
||||||
|
exonInRefOrderCache = exonsInReferenceOrder;
|
||||||
|
for ( GenomeLoc exon : exonsInReferenceOrder ) {
|
||||||
|
if ( exon.overlapsP(position) ) {
|
||||||
|
return ++exonNo;
|
||||||
|
}
|
||||||
|
++exonNo;
|
||||||
|
}
|
||||||
|
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public GenomeLoc getSortedExonLoc(int offset) {
|
||||||
|
ArrayList<GenomeLoc> exonsInReferenceOrder = exonInRefOrderCache != null ? exonInRefOrderCache : new ArrayList<GenomeLoc>(exons);
|
||||||
|
if ( exonInRefOrderCache == null ) {
|
||||||
|
Collections.sort(exonsInReferenceOrder);
|
||||||
|
}
|
||||||
|
exonInRefOrderCache = exonsInReferenceOrder;
|
||||||
|
return exonsInReferenceOrder.get(offset);
|
||||||
|
}
|
||||||
|
|
||||||
/** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */
|
/** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */
|
||||||
public boolean overlapsP (GenomeLoc that) {
|
public boolean overlapsP (GenomeLoc that) {
|
||||||
return getLocation().overlapsP(that);
|
return getLocation().overlapsP(that);
|
||||||
|
|
|
||||||
|
|
@ -88,7 +88,6 @@ public class TableCodec implements ReferenceDependentFeatureCodec {
|
||||||
try {
|
try {
|
||||||
boolean isFirst = true;
|
boolean isFirst = true;
|
||||||
while ((line = reader.readLine()) != null) {
|
while ((line = reader.readLine()) != null) {
|
||||||
System.out.println(line);
|
|
||||||
if ( isFirst && ! line.startsWith(headerDelimiter) && ! line.startsWith(commentDelimiter)) {
|
if ( isFirst && ! line.startsWith(headerDelimiter) && ! line.startsWith(commentDelimiter)) {
|
||||||
throw new UserException.MalformedFile("TableCodec file does not have a header");
|
throw new UserException.MalformedFile("TableCodec file does not have a header");
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue