Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

2011-11-06 10:26:19 -05:00 · 2011-11-06 10:26:19 -05:00 · 5c565d28b9
parent ebdced412c 3517489a22
commit 5c565d28b9
12 changed files with 44 additions and 58 deletions
--- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java
@ -131,7 +131,7 @@ public class AlignmentContextUtils {
        }
    }

-    public static Map<String, AlignmentContext> splitContextBySampleName(ReadBackedPileup pileup, String assumedSingleSample) {
+    public static Map<String, AlignmentContext> splitContextBySampleName(ReadBackedPileup pileup) {
        return splitContextBySampleName(new AlignmentContext(pileup.getLocation(), pileup));
    }

--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
@ -164,10 +164,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
    @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit")
    protected Boolean LIST = false;

-    @Hidden
-    @Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
-    protected String ASSUME_SINGLE_SAMPLE = null;
-
    @Hidden
    @Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
    protected boolean indelsOnly = false;
@ -213,11 +209,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
        List<String> rodName = Arrays.asList(variantCollection.variants.getName());
        Set<String> samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);

-        // if there are no valid samples, warn the user
-        if ( samples.size() == 0 ) {
-            logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired.");
-        }
-
        if ( USE_ALL_ANNOTATIONS )
            engine = new VariantAnnotatorEngine(annotationsToExclude, this, getToolkit());
        else
@ -301,9 +292,9 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
        Map<String, AlignmentContext> stratifiedContexts;
        if ( BaseUtils.simpleBaseToBaseIndex(ref.getBase()) != -1 ) {
            if ( ! context.hasExtendedEventPileup() ) {
-                stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup(), ASSUME_SINGLE_SAMPLE);
+                stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getBasePileup());
            } else {
-                stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getExtendedEventPileup(), ASSUME_SINGLE_SAMPLE);
+                stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(context.getExtendedEventPileup());
            }
            if ( stratifiedContexts != null ) {
                annotatedVCs = new ArrayList<VariantContext>(VCs.size());
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCalcLikelihoods.java
@ -39,7 +39,6 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;

 import java.util.HashSet;
 import java.util.Set;
-import java.util.TreeSet;


 /**
@ -71,12 +70,7 @@ public class UGCalcLikelihoods extends LocusWalker<VariantCallContext, Integer>

    public void initialize() {
        // get all of the unique sample names
-        // if we're supposed to assume a single sample, do so
-        Set<String> samples = new TreeSet<String>();
-        if ( UAC.ASSUME_SINGLE_SAMPLE != null )
-            samples.add(UAC.ASSUME_SINGLE_SAMPLE);
-        else
-            samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
+        Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());

        UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples);

--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@ -96,11 +96,6 @@ public class UnifiedArgumentCollection {
    @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when in GENOTYPE_MODE = GENOTYPE_GIVEN_ALLELES", required=false)
    public RodBinding<VariantContext> alleles;

-    // control the error modes
-    @Hidden
-    @Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false)
-    public String ASSUME_SINGLE_SAMPLE = null;
-
    /**
     * The minimum confidence needed in a given base for it to be used in variant calling.  Note that the base quality of a base
     * is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value.
@ -170,7 +165,6 @@ public class UnifiedArgumentCollection {
        uac.GenotypingMode = GenotypingMode;
        uac.OutputMode = OutputMode;
        uac.COMPUTE_SLOD = COMPUTE_SLOD;
-        uac.ASSUME_SINGLE_SAMPLE = ASSUME_SINGLE_SAMPLE;
        uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING;
        uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING;
        uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE;
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@ -206,12 +206,7 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
     **/
    public void initialize() {
        // get all of the unique sample names
-        // if we're supposed to assume a single sample, do so
-        Set<String> samples = new TreeSet<String>();
-        if ( UAC.ASSUME_SINGLE_SAMPLE != null )
-            samples.add(UAC.ASSUME_SINGLE_SAMPLE);
-        else
-            samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
+        Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());

        // initialize the verbose writer
        if ( verboseWriter != null )
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
@ -106,12 +106,7 @@ public class UnifiedGenotyperEngine {
    // ---------------------------------------------------------------------------------------------------------
    @Requires({"toolkit != null", "UAC != null"})
    public UnifiedGenotyperEngine(GenomeAnalysisEngine toolkit, UnifiedArgumentCollection UAC) {
-        this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null,
-                // get the number of samples
-                // if we're supposed to assume a single sample, do so
-                UAC.ASSUME_SINGLE_SAMPLE != null ?
-                        new TreeSet<String>(Arrays.asList(UAC.ASSUME_SINGLE_SAMPLE)) :
-                        SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()));
+        this(toolkit, UAC, Logger.getLogger(UnifiedGenotyperEngine.class), null, null, SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()));
    }

    @Requires({"toolkit != null", "UAC != null", "logger != null", "samples != null && samples.size() > 0"})
@ -253,7 +248,7 @@ public class UnifiedGenotyperEngine {
                pileup = rawContext.getExtendedEventPileup();
            else if (rawContext.hasBasePileup())
                pileup = rawContext.getBasePileup();
-            stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
+            stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);

            vc = annotationEngine.annotateContext(tracker, ref, stratifiedContexts, vc);
        }
@ -435,7 +430,7 @@ public class UnifiedGenotyperEngine {
                pileup = rawContext.getExtendedEventPileup();
            else if (rawContext.hasBasePileup())
                pileup = rawContext.getBasePileup();
-            stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
+            stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);

            vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall);
        }
@ -569,7 +564,7 @@ public class UnifiedGenotyperEngine {
                    return null;

                // stratify the AlignmentContext and cut by sample
-                stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
+                stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);

            } else {

@ -586,12 +581,12 @@ public class UnifiedGenotyperEngine {
                    return null;

                // stratify the AlignmentContext and cut by sample
-                stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
+                stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
            }
        } else if ( model == GenotypeLikelihoodsCalculationModel.Model.SNP ) {

            // stratify the AlignmentContext and cut by sample
-            stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE);
+            stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup());

            if( !(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) ) {
                int numDeletions = 0;
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
@ -817,7 +817,8 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
                        // For now, we will just arbitrarily add 10 to the mapping quality. [EB, 6/7/2010].
                        // TODO -- we need a better solution here
                        GATKSAMRecord read = aRead.getRead();
-                        read.setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + 10, 254));
+                        if ( read.getMappingQuality() != 255 ) // 255 == Unknown, so don't modify it
+                            read.setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + 10, 254));

                        // before we fix the attribute tags we first need to make sure we have enough of the reference sequence
                        int neededBasesToLeft = leftmostIndex - read.getAlignmentStart();
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java
@ -58,15 +58,12 @@ public class RandomlySplitVariants extends RodWalker<Integer, Integer> {
    @Argument(fullName="fractionToOut1", shortName="fraction", doc="Fraction of records to be placed in out1 (must be 0 >= fraction <= 1); all other records are placed in out2", required=false)
    protected double fraction = 0.5;

-    protected int iFraction;
-
    /**
     * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher
     */
    public void initialize() {
        if ( fraction < 0.0 || fraction > 1.0 )
            throw new UserException.BadArgumentValue("fractionToOut1", "this value needs to be a number between 0 and 1");
-        iFraction = (int)(fraction * 1000.0);

        // setup the header info
        final List<String> inputNames = Arrays.asList(variantCollection.variants.getName());
@ -93,8 +90,8 @@ public class RandomlySplitVariants extends RodWalker<Integer, Integer> {

        Collection<VariantContext> vcs = tracker.getValues(variantCollection.variants, context.getLocation());
        for ( VariantContext vc : vcs ) {
-            int random = GenomeAnalysisEngine.getRandomGenerator().nextInt(1000);
-            if ( random < iFraction )
+            double random = GenomeAnalysisEngine.getRandomGenerator().nextDouble();
+            if ( random < fraction )
                vcfWriter1.add(vc);
            else
                vcfWriter2.add(vc);
@ -107,5 +104,8 @@ public class RandomlySplitVariants extends RodWalker<Integer, Integer> {

    public Integer reduce(Integer value, Integer sum) { return value + sum; }

-    public void onTraversalDone(Integer result) { logger.info(result + " records processed."); }
+    public void onTraversalDone(Integer result) {
+        logger.info(result + " records processed.");
+        vcfWriter2.close();
+    }
 }
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
@ -162,19 +162,27 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
     * @return a feature, (not guaranteed complete) that has the correct start and stop
     */
    public Feature decodeLoc(String line) {
-        String[] locParts = new String[6];
+        lineNo++;
+
+        // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
+        if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
+
+        // our header cannot be null, we need the genotype sample names and counts
+        if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
+
+        final String[] locParts = new String[6];
        int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);

        if ( nParts != 6 )
            throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo);

        // get our alleles (because the end position depends on them)
-        String ref = getCachedString(locParts[3].toUpperCase());
-        String alts = getCachedString(locParts[4].toUpperCase());
-        List<Allele> alleles = parseAlleles(ref, alts, lineNo);
+        final String ref = getCachedString(locParts[3].toUpperCase());
+        final String alts = getCachedString(locParts[4].toUpperCase());
+        final List<Allele> alleles = parseAlleles(ref, alts, lineNo);

        // find out our location
-        int start = Integer.valueOf(locParts[1]);
+        final int start = Integer.valueOf(locParts[1]);
        int stop = start;

        // ref alleles don't need to be single bases for monomorphic sites
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java
@ -124,6 +124,14 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest {
        executeTest("using expression", spec);
    }

+    @Test
+    public void testUsingExpressionWithID() {
+        WalkerTestSpec spec = new WalkerTestSpec(
+                baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.ID -L " + validationDataLocation + "vcfexample3empty.vcf", 1,
+                Arrays.asList("4a6f0675242f685e9072c1da5ad9e715"));
+        executeTest("using expression with ID", spec);
+    }
+
    @Test
    public void testTabixAnnotations() {
        final String MD5 = "13269d5a2e16f06fd755cc0fb9271acf";
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java
@ -9,7 +9,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
    private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval";
    private static String fundamentalTestVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf";
    private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.vcf";
-    private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.HG00625.vcf";
+    private static String fundamentalTestSNPsOneSampleVCF = variantEvalTestDataRoot + "/" + "FundamentalsTest.annotated.db.subset.final.NA12045.vcf";

    private static String cmdRoot = "-T VariantEval" +
            " -R " + b36KGReference;
@ -359,7 +359,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {

    @Test
    public void testPerSampleAndSubsettedSampleHaveSameResults() {
-        String md5 = "b0565ac61b2860248e4abd478a177b5e";
+        String md5 = "7425ca5c439afd7bb33ed5cfea02c2b3";

        WalkerTestSpec spec = new WalkerTestSpec(
                buildCommandLine(
@ -369,7 +369,7 @@ public class VariantEvalIntegrationTest extends WalkerTest {
                        "--eval " + fundamentalTestSNPsVCF,
                        "-noEV",
                        "-EV CompOverlap",
-                        "-sn HG00625",
+                        "-sn NA12045",
                        "-noST",
                        "-L " + fundamentalTestSNPsVCF,
                        "-o %s"
--- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKScatterFunction.scala
@ -56,7 +56,7 @@ trait GATKScatterFunction extends ScatterFunction {
  override def init() {
    this.originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK]
    this.referenceSequence = this.originalGATK.reference_sequence
-    if (this.originalGATK.intervals.isEmpty && this.originalGATK.intervalsString.isEmpty) {
+    if (this.originalGATK.intervals.isEmpty && (this.originalGATK.intervalsString == null || this.originalGATK.intervalsString.isEmpty)) {
      this.intervals ++= GATKScatterFunction.getGATKIntervals(this.referenceSequence, List.empty[String]).contigs
    } else {
      this.intervals ++= this.originalGATK.intervals.map(_.toString)