From 5ab2e83904ea95dd141410476a7aeb5307f67420 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Thu, 7 Jul 2011 06:15:10 -0400 Subject: [PATCH] a) Cosmetic modifications to IndelType annotation. b) Add ability to select samples from a file in PrintReads, c) fixes to shaped AF random selection in SelectVariants --- .../sting/gatk/walkers/PrintReadsWalker.java | 39 +++++++++++++++++++ .../gatk/walkers/annotator/IndelType.java | 24 ++++++++++-- .../walkers/variantutils/SelectVariants.java | 18 +++++---- 3 files changed, 69 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index 07938d322..2b6673003 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -32,9 +32,14 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.baq.BAQ; +import java.io.File; import java.io.PrintStream; +import java.util.Collection; +import java.util.Set; +import java.util.TreeSet; /** * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear @@ -54,6 +59,13 @@ public class PrintReadsWalker extends ReadWalker { String platform = null; // E.g. ILLUMINA, 454 @Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false) int nReadsToPrint = -1; + @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false) + public Set sampleFiles; + @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false) + public Set sampleNames; + + private TreeSet samplesToChoose = new TreeSet(); + private boolean NO_SAMPLES_SPECIFIED = false; /** * The initialize function. @@ -61,6 +73,17 @@ public class PrintReadsWalker extends ReadWalker { public void initialize() { if ( platform != null ) platform = platform.toUpperCase(); + + Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); + samplesToChoose.addAll(samplesFromFile); + + if (sampleNames != null) + samplesToChoose.addAll(sampleNames); + + if(samplesToChoose.isEmpty()) { + NO_SAMPLES_SPECIFIED = true; + } + } /** @@ -87,6 +110,22 @@ public class PrintReadsWalker extends ReadWalker { if ( readPlatformAttr == null || !readPlatformAttr.toString().toUpperCase().contains(platform)) return false; } + if (!NO_SAMPLES_SPECIFIED ) { + // user specified samples to select + String readSample = read.getReadGroup().getSample(); + boolean found = false; + for (String sampleSelected : samplesToChoose) { + if (readSample.equalsIgnoreCase(sampleSelected)) { + found = true; + break; + } + + } + + if (!found) + return false; + } + // check if we've reached the output limit if ( nReadsToPrint == 0 ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java index ee8b01d7d..af4d2ff28 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java @@ -24,11 +24,27 @@ public class IndelType implements InfoFieldAnnotation, ExperimentalAnnotation { public Map annotate(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { int run; - if ( vc.isIndel() && vc.isBiallelic() ) { + if (vc.isMixed()) { + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%s", "MIXED")); + return map; + + } + else if ( vc.isIndel() ) { String type=""; - ArrayList inds = IndelUtils.findEventClassificationIndex(vc, ref); - for (int k : inds) { - type = type+ IndelUtils.getIndelClassificationName(k)+"."; + if (!vc.isBiallelic()) + type = "MULTIALLELIC_INDEL"; + else { + if (vc.isInsertion()) + type = "INS."; + else if (vc.isDeletion()) + type = "DEL."; + else + type = "OTHER."; + ArrayList inds = IndelUtils.findEventClassificationIndex(vc, ref); + for (int k : inds) { + type = type+ IndelUtils.getIndelClassificationName(k)+"."; + } } Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%s", type)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index aa8a49175..80a497d33 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -96,8 +96,9 @@ public class SelectVariants extends RodWalker { @Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false) private boolean KEEP_AF_SPECTRUM = false; + @Hidden @Argument(fullName="afFile", shortName="afFile", doc="The output recal file used by ApplyRecalibration", required=false) - private File AF_FILE = null; + private File AF_FILE = new File(""); @Argument(fullName="family_structure", shortName="family", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) private String FAMILY_STRUCTURE = ""; @@ -240,7 +241,7 @@ public class SelectVariants extends RodWalker { if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + fractionRandom + "% of the variants at random from the variant track"); - if (AF_FILE != null) { + if (KEEP_AF_SPECTRUM) { try { afBreakpoints = new ArrayList(); afBoosts = new ArrayList(); @@ -328,8 +329,7 @@ public class SelectVariants extends RodWalker { if (SELECT_RANDOM_FRACTION && KEEP_AF_SPECTRUM ) { // ok we have a comp VC and we need to match the AF spectrum of inputAFRodName. // We then pick a variant with probablity AF*desiredFraction - - if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) { + if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) { String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY); double af; @@ -354,21 +354,23 @@ public class SelectVariants extends RodWalker { // now boost af by table read from file if desired //double bkpt = 0.0; int bkidx = 0; - if (AF_FILE != null) { + if (!afBreakpoints.isEmpty()) { for ( Double bkpt : afBreakpoints) { if (af < bkpt + bkDelta) break; else bkidx++; } - afBoost = afBreakpoints.get(bkidx); - System.out.format("af:%f bkidx:%d afboost:%f\n",af,bkidx,afBoost); + if (bkidx >=afBoosts.size()) + bkidx = afBoosts.size()-1; + afBoost = afBoosts.get(bkidx); + //System.out.formatPrin("af:%f bkidx:%d afboost:%f\n",af,bkidx,afBoost); } //System.out.format("%s .. %4.4f\n",afo.toString(), af); - if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom * af * afBoost) + if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom * afBoost * afBoost) vcfWriter.add(sub, ref.getBase()); }