cleanup before moving files

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1365 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-08-03 19:08:24 +00:00
parent e3b08f245f
commit 20986a03de
3 changed files with 71 additions and 61 deletions

View File

@@ -23,38 +23,37 @@ import java.io.File;
@ReadFilters(ZeroMappingQualityReadFilter.class)
public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, GenotypeWriter> {
// Control output settings
@Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = true) public File VARIANTS_FILE;
@Argument(fullName = "metrics_out", shortName = "metout", doc = "File to which metrics should be written", required = false) public File METRICS_FILE = new File("/dev/stderr");
@Argument(fullName = "variant_output_format", shortName = "vf", doc = "File to which metrics should be written", required = false) public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
@Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = true)
public File VARIANTS_FILE;
@Argument(fullName = "variant_output_format", shortName = "vf", doc = "File to which metrics should be written", required = false)
public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GELI;
// Control what goes into the variants file and what format that file should have
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)public Double LOD_THRESHOLD = Double.MIN_VALUE;
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confidient genotypes or just the variants?", required = false) public boolean GENOTYPE = false;
@Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)
public Double LOD_THRESHOLD = Double.MIN_VALUE;
@Argument(fullName = "3BaseErrors", shortName = "3BaseErrors", doc = "Should we use a 3-base error mode (so that P(b_true != b_called | e) == e / 3?", required = false) public boolean THREE_BASE_ERRORS = false;
@Argument(fullName = "genotype", shortName = "genotype", doc = "Should we output confidient genotypes or just the variants?", required = false)
public boolean GENOTYPE = false;
// Control periodic reporting features
@Argument(fullName = "metrics_interval", shortName = "metint", doc = "Number of loci to process between metrics reports", required = false) public Integer METRICS_INTERVAL = 50000;
@Argument(fullName = "suppress_metrics", shortName = "printmets", doc = "If specified, don't display metrics", required = false) public Boolean SUPPRESS_METRICS = false;
@Argument(fullName = "3BaseErrors", shortName = "3BaseErrors", doc = "Should we use a 3-base error mode (so that P(b_true != b_called | e) == e / 3?", required = false)
public boolean THREE_BASE_ERRORS = false;
// Control what features we use in calling variants
//@Argument(fullName = "ignore_secondary_bases", shortName = "nosb", doc = "Ignore secondary base examination", required = false) public Boolean IGNORE_SECONDARY_BASES = false;
@Argument(fullName = "call_indels", shortName = "indels", doc = "Call indels as well as point mutations", required = false) public Boolean CALL_INDELS = false;
public enum Caller {
OLD_AND_BUSTED,
NEW_HOTNESS
}
@Argument(fullName = "caller", doc = "", required = false)
public Caller caller = Caller.OLD_AND_BUSTED;
// Control how the genotype hypotheses are weighed
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false) public Double heterozygosity = GenotypeLikelihoods.HUMAN_HETEROZYGOSITY;
@Argument(fullName = "priors_hapmap", shortName = "phapmap", doc = "Comma-separated prior likelihoods for Hapmap loci (homref,het,homvar)", required = false) public String PRIORS_HAPMAP = "0.999,1e-3,1e-5";
@Argument(fullName = "priors_dbsnp", shortName = "pdbsnp", doc = "Comma-separated prior likelihoods for dbSNP loci (homref,het,homvar)", required = false) public String PRIORS_DBSNP = "0.999,1e-3,1e-5";
// Control various sample-level settings
@Argument(fullName = "sample_name_regex", shortName = "sample_name_regex", doc = "Replaces the sample name specified in the BAM read group with the value supplied here", required = false) public String SAMPLE_NAME_REGEX = null;
@Argument(fullName = "keepQ0Bases", shortName = "keepQ0Bases", doc = "If true, then Q0 bases will be included in the genotyping calculation, and treated as Q1 -- this is really not a good idea", required = false)
public boolean keepQ0Bases = false;
public AlleleMetrics metricsOut;
public String sampleName;
public double[] plocus;
public double[] phapmap;
public double[] pdbsnp;
@@ -103,8 +102,6 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
/** Initialize the walker with some sensible defaults */
public void initialize() {
metricsOut = new AlleleMetrics(METRICS_FILE, LOD_THRESHOLD);
plocus = GenotypeLikelihoods.computePriors(heterozygosity);
phapmap = priorsArray(PRIORS_HAPMAP);
pdbsnp = priorsArray(PRIORS_DBSNP);
@@ -120,51 +117,31 @@ public class SingleSampleGenotyper extends LocusWalker<SSGGenotypeCall, Genotype
* @return an AlleleFrequencyEstimate object
*/
/**
 * Calls the genotype at the current locus with both the legacy and the new caller,
 * reports any disagreement between the two on stdout, and returns whichever call
 * was selected via the 'caller' argument.
 *
 * @param tracker reference metadata (dbSNP/HapMap) overlapping this locus
 * @param ref     the reference base at this locus
 * @param context the reads overlapping this locus
 * @return the genotype call from the selected caller; may be null if no call was made
 */
public SSGGenotypeCall map(RefMetaDataTracker tracker, char ref, LocusContext context) {
    //rationalizeSampleName(context.getReads().get(0));
    SSGGenotypeCall oldAndBusted = mapOldAndBusted(tracker, ref, context);
    SSGGenotypeCall newHotness = mapNewHotness(tracker, ref, context);

    // todo -- totally convoluted code!
    // Null-safe comparison: either caller may return null at a locus (mapNewHotness
    // explicitly guards for a null call); calling equals() on a null old call would NPE.
    boolean callsAgree = (oldAndBusted == null) ? (newHotness == null) : oldAndBusted.equals(newHotness);
    if ( ! callsAgree ) {
        System.out.printf("Calls not equal:%nold: %s%nnew: %s%n", oldAndBusted, newHotness);
    }

    return caller == Caller.OLD_AND_BUSTED ? oldAndBusted : newHotness;
}
/**
 * Computes the genotype call at this locus using the "new hotness" code path,
 * feeds the resulting call into the running allele metrics, and (unless metrics
 * are suppressed) prints metrics at the configured locus interval.
 *
 * @param tracker reference metadata overlapping this locus
 * @param ref     the reference base at this locus
 * @param context the reads overlapping this locus
 * @return the genotype call, or null if no call was produced
 */
private SSGGenotypeCall mapNewHotness(RefMetaDataTracker tracker, char ref, LocusContext context) {
    final ReadBackedPileup locusPileup = new ReadBackedPileup(ref, context);
    final GenotypeLikelihoods likelihoods = makeGenotypeLikelihood(tracker);
    likelihoods.setDiscovery(GENOTYPE); // discovery mode vs. variant-detection mode

    final SSGGenotypeCall call = (SSGGenotypeCall) likelihoods.callGenotypes(tracker, ref, locusPileup);
    if (call != null) {
        metricsOut.nextPosition(call, tracker);
    }
    if (!SUPPRESS_METRICS) {
        metricsOut.printMetricsAtLocusIntervals(METRICS_INTERVAL);
    }
    return call;
}
/**
* Sometimes the sample names in the BAM files get screwed up. Fix it here if we can.
*
* @param read a read from the pileup (assuming all the reads have the same sample name)
*
* @return a repaired sample name
*/
/**
 * Sometimes the sample names in the BAM files get screwed up. Fix it here if we can.
 *
 * Looks up the sample from the read's RG read group, optionally rewrites it with
 * SAMPLE_NAME_REGEX (keeping capture group 1), caches it in sampleName, and checks
 * that every subsequent read carries the same sample.
 *
 * @param read a read from the pileup (assuming all the reads have the same sample name)
 * @return a repaired sample name (the cached sampleName; may be null if the read has no read group)
 * @throws StingException if a read's sample name disagrees with the cached one
 */
private String rationalizeSampleName(SAMRecord read) {
    String RG = (String) (read.getAttribute("RG"));
    SAMReadGroupRecord read_group_record = read.getHeader().getReadGroup(RG);
    if (read_group_record != null) {
        // Use the read group we already looked up instead of fetching it a second time.
        String localSampleName = read_group_record.getSample();
        if (SAMPLE_NAME_REGEX != null) {
            localSampleName = localSampleName.replaceAll(SAMPLE_NAME_REGEX, "$1");
        }
        if (sampleName == null) {
            sampleName = localSampleName;
        } else if (!sampleName.equals(localSampleName)) {
            throw new StingException(String.format("Samples appear to have been mixed up: expected '%s' but found '%s'.", sampleName, localSampleName));
        }
    }
    return sampleName;
}
/**
 * Computes the genotype call at this locus using the legacy ("old and busted")
 * code path. Unlike mapNewHotness, this path does not touch the metrics output.
 *
 * @param tracker reference metadata overlapping this locus
 * @param ref     the reference base at this locus
 * @param context the reads overlapping this locus
 * @return the genotype call, or null if no call was produced
 */
private SSGGenotypeCall mapOldAndBusted(RefMetaDataTracker tracker, char ref, LocusContext context) {
    final GenotypeLikelihoods likelihoods = makeGenotypeLikelihood(tracker);
    likelihoods.setDiscovery(GENOTYPE); // discovery mode vs. variant-detection mode
    return (SSGGenotypeCall) likelihoods.callGenotypes(tracker, ref, new ReadBackedPileup(ref, context));
}
/**
* Calls the underlying, single locus genotype of the sample

View File

@@ -76,4 +76,11 @@ public interface Genotype {
* @return
*/
public Variant toVariant();
/**
 * return a readable string representation of this genotype
 *
 * @return a readable string representation of this genotype
 */
public String toString();
}

View File

@@ -13,10 +13,7 @@ import org.broadinstitute.sting.utils.genotype.confidence.BayesianConfidenceScor
import org.broadinstitute.sting.utils.genotype.confidence.ConfidenceScore;
import org.broadinstitute.sting.utils.genotype.variant.Variant;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.TreeMap;
import java.util.*;
/**
@@ -96,6 +93,35 @@ public class SSGGenotypeCall implements GenotypeCall, GenotypeOutput {
}
}
@Override
// NOTE(review): equals() is overridden without a matching hashCode(); these objects
// must not be used as hash-map/set keys until that is addressed.
public boolean equals(Object other) {
    if (other == null)
        return false;
    if (other instanceof SSGGenotypeCall) {
        SSGGenotypeCall otherCall = (SSGGenotypeCall) other;

        // Compare likelihood arrays defensively: previously a shorter array in
        // otherCall caused an ArrayIndexOutOfBoundsException instead of returning
        // false. Also short-circuits the loop once a mismatch is found.
        boolean likelihoodsMatch = mLikelihoods.length == otherCall.mLikelihoods.length;
        for (int i = 0; likelihoodsMatch && i < mLikelihoods.length; i++) {
            likelihoodsMatch = MathUtils.compareDoubles(mLikelihoods[i], otherCall.mLikelihoods[i]) == 0;
        }

        //System.out.printf("likelihoodsMatch = %b%n", likelihoodsMatch);
        return this.mRefBase.equals(otherCall.mRefBase) &&
               this.mPloidy == otherCall.mPloidy &&
               MathUtils.compareDoubles(this.bestNext, otherCall.bestNext) == 0 &&
               MathUtils.compareDoubles(this.bestRef, otherCall.bestRef) == 0 &&
               this.readDepth == otherCall.readDepth &&
               MathUtils.compareDoubles(this.rmsMapping, otherCall.rmsMapping) == 0 &&
               likelihoodsMatch;
    }
    return false;
}
/**
 * @return a one-line, human-readable summary of this call: reference base,
 *         read depth, RMS mapping quality, best-vs-ref and best-vs-next scores,
 *         and the raw likelihood array
 */
public String toString() {
    final Object[] parts = { mRefBase, readDepth, rmsMapping, bestRef, bestNext, Arrays.toString(mLikelihoods) };
    return String.format("ref=%s depth=%d rmsMAPQ=%.2f bestVRef=%.2f bestVNext=%.2f likelihoods=%s", parts);
}
/**
* calculate the rms , given the read pileup
*
@@ -128,7 +154,7 @@ public class SSGGenotypeCall implements GenotypeCall, GenotypeOutput {
*/
/**
 * @return true if the best-scoring genotype (the one stored under the largest
 *         key in mGenotypes) differs from the reference base
 */
@Override
public boolean isVariation() {
    // The diff rendering left both the old and the new return statement in the
    // body; only the post-change line (using getReferencebase()) is kept, since
    // the old line (mRefBase.charAt(0)) is the removed side of the hunk.
    return mGenotypes.get(mGenotypes.descendingKeySet().first()).isVariant(getReferencebase());
}
/**