diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java index ee7a0dc1c..c3ac13450 100644 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/SingleSampleGenotyper.java @@ -2,57 +2,61 @@ package org.broadinstitute.sting.playground.gatk.walkers; import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.LocusContext; import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.ReadFilters; +import org.broadinstitute.sting.gatk.walkers.WalkerName; import org.broadinstitute.sting.playground.utils.AlleleFrequencyEstimate; import org.broadinstitute.sting.playground.utils.AlleleMetrics; import org.broadinstitute.sting.playground.utils.GenotypeLikelihoods; import org.broadinstitute.sting.playground.utils.IndelLikelihood; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.BasicPileup; -import org.broadinstitute.sting.utils.ReadBackedPileup; -import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.genotype.GenotypeWriter; +import org.broadinstitute.sting.utils.genotype.GenotypeWriterFactory; +import org.broadinstitute.sting.utils.genotype.LikelihoodObject; import java.io.File; -import java.io.FileNotFoundException; import java.io.PrintStream; +import java.io.FileNotFoundException; import java.util.List; @ReadFilters(ZeroMappingQualityReadFilter.class) public class SingleSampleGenotyper extends LocusWalker { // Control output settings - @Argument(fullName="variants_out", shortName="varout", doc="File to which variants should be written", required=true) public File VARIANTS_FILE; - @Argument(fullName="metrics_out", shortName="metout", doc="File to which metrics should be written", required=false) public File METRICS_FILE = new File("/dev/stderr"); + @Argument(fullName = "variants_out", shortName = "varout", doc = "File to which variants should be written", required = true) public File VARIANTS_FILE; + @Argument(fullName = "metrics_out", shortName = "metout", doc = "File to which metrics should be written", required = false) public File METRICS_FILE = new File("/dev/stderr"); + @Argument(fullName = "variant_output_format", shortName = "vf", doc = "File to which metrics should be written", required = false) public GenotypeWriterFactory.GENOTYPE_FORMAT VAR_FORMAT = GenotypeWriterFactory.GENOTYPE_FORMAT.GLF; // Control what goes into the variants file and what format that file should have - @Argument(fullName="lod_threshold", shortName="lod", doc="The lod threshold on which variants should be filtered", required=false) public Double LOD_THRESHOLD = 5.0; - @Argument(fullName="format_geli", shortName="geli", doc="Output variant calls in Geli/Picard format", required=false) public boolean GELI_OUTPUT_FORMAT = false; + @Argument(fullName = "lod_threshold", shortName = "lod", doc = "The lod threshold on which variants should be filtered", required = false)public Double LOD_THRESHOLD = Double.MIN_VALUE; // Control periodic reporting features - @Argument(fullName="metrics_interval", shortName="metint", doc="Number of loci to process between metrics reports", required=false) public Integer METRICS_INTERVAL = 50000; - @Argument(fullName="suppress_metrics", shortName="printmets", doc="If specified, don't display metrics", required=false) public Boolean SUPPRESS_METRICS = false; + @Argument(fullName = "metrics_interval", shortName = "metint", doc = "Number of loci to process between metrics reports", required = false) public Integer METRICS_INTERVAL = 50000; + @Argument(fullName = "suppress_metrics", shortName = "printmets", doc = "If specified, don't display metrics", required = false) public Boolean SUPPRESS_METRICS = false; // Control what features we use in calling variants - @Argument(fullName="ignore_secondary_bases", shortName="nosb", doc="Ignore secondary base examination", required=false) public Boolean IGNORE_SECONDARY_BASES = false; - @Argument(fullName="call_indels", shortName="indels", doc="Call indels as well as point mutations", required=false) public Boolean CALL_INDELS = false; + @Argument(fullName = "ignore_secondary_bases", shortName = "nosb", doc = "Ignore secondary base examination", required = false) public Boolean IGNORE_SECONDARY_BASES = false; + @Argument(fullName = "call_indels", shortName = "indels", doc = "Call indels as well as point mutations", required = false) public Boolean CALL_INDELS = false; // Control how the genotype hypotheses are weighed - @Argument(fullName="priors_any_locus", shortName="plocus", doc="Comma-separated prior likelihoods for any locus (homref,het,homvar)", required=false) public String PRIORS_ANY_LOCUS = "0.999,1e-3,1e-5"; - @Argument(fullName="priors_hapmap", shortName="phapmap", doc="Comma-separated prior likelihoods for Hapmap loci (homref,het,homvar)", required=false) public String PRIORS_HAPMAP = "0.999,1e-3,1e-5"; - @Argument(fullName="priors_dbsnp", shortName="pdbsnp", doc="Comma-separated prior likelihoods for dbSNP loci (homref,het,homvar)", required=false) public String PRIORS_DBSNP = "0.999,1e-3,1e-5"; - @Argument(fullName="priors_2nd_on", shortName="p2ndon", doc="Comma-separated prior likelihoods for the secondary bases of primary on-genotype bases (AA,AC,AG,AT,CC,CG,CT,GG,GT,TT)", required=false) public String PRIORS_2ND_ON = "0.000,0.302,0.366,0.142,0.000,0.548,0.370,0.000,0.319,0.000"; - @Argument(fullName="priors_2nd_off", shortName="p2ndoff", doc="Comma-separated prior likelihoods for the secondary bases of primary off-genotype bases (AA,AC,AG,AT,CC,CG,CT,GG,GT,TT)", required=false) public String PRIORS_2ND_OFF = "0.480,0.769,0.744,0.538,0.575,0.727,0.768,0.589,0.762,0.505"; + @Argument(fullName = "priors_any_locus", shortName = "plocus", doc = "Comma-separated prior likelihoods for any locus (homref,het,homvar)", required = false) public String PRIORS_ANY_LOCUS = "0.999,1e-3,1e-5"; + @Argument(fullName = "priors_hapmap", shortName = "phapmap", doc = "Comma-separated prior likelihoods for Hapmap loci (homref,het,homvar)", required = false) public String PRIORS_HAPMAP = "0.999,1e-3,1e-5"; + @Argument(fullName = "priors_dbsnp", shortName = "pdbsnp", doc = "Comma-separated prior likelihoods for dbSNP loci (homref,het,homvar)", required = false) public String PRIORS_DBSNP = "0.999,1e-3,1e-5"; + @Argument(fullName = "priors_2nd_on", shortName = "p2ndon", doc = "Comma-separated prior likelihoods for the secondary bases of primary on-genotype bases (AA,AC,AG,AT,CC,CG,CT,GG,GT,TT)", required = false) public String PRIORS_2ND_ON = "0.000,0.302,0.366,0.142,0.000,0.548,0.370,0.000,0.319,0.000"; + @Argument(fullName = "priors_2nd_off", shortName = "p2ndoff", doc = "Comma-separated prior likelihoods for the secondary bases of primary off-genotype bases (AA,AC,AG,AT,CC,CG,CT,GG,GT,TT)", required = false) public String PRIORS_2ND_OFF = "0.480,0.769,0.744,0.538,0.575,0.727,0.768,0.589,0.762,0.505"; // Control various sample-level settings - @Argument(fullName="sample_name_regex", shortName="sample_name_regex", doc="Replaces the sample name specified in the BAM read group with the value supplied here", required=false) public String SAMPLE_NAME_REGEX = null; + @Argument(fullName = "sample_name_regex", shortName = "sample_name_regex", doc = "Replaces the sample name specified in the BAM read group with the value supplied here", required = false) public String SAMPLE_NAME_REGEX = null; public AlleleMetrics metricsOut; - public PrintStream variantsOut; + public PrintStream variantsOut; public String sampleName; + private GenotypeWriter mGenotypeWriter; public double[] plocus; public double[] phapmap; @@ -65,14 +69,17 @@ public class SingleSampleGenotyper extends LocusWalker reads = context.getReads(); List offsets = context.getOffsets(); - // Handle indels, but don't do anything with the result yet. + // Handle indels, but don't do anything with the result yet. IndelLikelihood I = (CALL_INDELS) ? callIndel(context, reads, offsets) : null; // Handle single-base polymorphisms. @@ -189,11 +213,12 @@ public class SingleSampleGenotyper extends LocusWalker reads, List offsets) { @@ -207,17 +232,17 @@ public class SingleSampleGenotyper extends LocusWalker reads, List offsets) { String[] indels = BasicPileup.indelPileup(reads, offsets); IndelLikelihood indelCall = new IndelLikelihood(indels, 1e-4); - if (! indelCall.getType().equals("ref")) { + if (!indelCall.getType().equals("ref")) { System.out.printf("INDEL %s %s\n", context.getLocation(), indelCall); } @@ -244,7 +270,8 @@ public class SingleSampleGenotyper extends LocusWalker= 5 variant, output it to disk. * - * @param alleleFreq an AlleleFrequencyEstimage object for the variant. - * @param sum accumulator for the reduce. + * @param alleleFreq an AlleleFrequencyEstimage object for the variant. + * @param sum accumulator for the reduce. + * * @return an empty string */ public String reduce(AlleleFrequencyEstimate alleleFreq, String sum) { if (alleleFreq != null && alleleFreq.lodVsRef >= LOD_THRESHOLD) { - String line = GELI_OUTPUT_FORMAT ? alleleFreq.asGeliString() : alleleFreq.asTabularString(); - variantsOut.println(line); + if (this.VAR_FORMAT == GenotypeWriterFactory.GENOTYPE_FORMAT.GELI) { + variantsOut.println(alleleFreq.asGeliString()); + } else if (this.VAR_FORMAT == GenotypeWriterFactory.GENOTYPE_FORMAT.TABULAR) { + variantsOut.println(alleleFreq.asTabularString()); + } else if (this.VAR_FORMAT == GenotypeWriterFactory.GENOTYPE_FORMAT.GLF) { + SAMSequenceRecord rec = GenomeLocParser.getContigInfo(alleleFreq.location.getContig()); + LikelihoodObject obj = new LikelihoodObject(alleleFreq.posteriors, LikelihoodObject.LIKELIHOOD_TYPE.LOG); + obj.setLikelihoodType(LikelihoodObject.LIKELIHOOD_TYPE.NEGITIVE_LOG); + this.mGenotypeWriter.addGenotypeCall(rec,(int)alleleFreq.location.getStart(),0.0f,alleleFreq.ref,alleleFreq.depth,obj); + } } - - return ""; - } - /** - * Close the variant file. - */ - public void onTraversalDone() { - variantsOut.close(); - } + return ""; + } + + /** Close the variant file. */ + public void onTraversalDone() { + if (this.VAR_FORMAT == GenotypeWriterFactory.GENOTYPE_FORMAT.GLF) { + mGenotypeWriter.close(); + } else { + this.variantsOut.close(); + } + } } diff --git a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java b/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java index 24467274f..cb820298c 100755 --- a/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java +++ b/java/src/org/broadinstitute/sting/playground/utils/AlleleFrequencyEstimate.java @@ -1,26 +1,12 @@ package org.broadinstitute.sting.playground.utils; import org.broadinstitute.sting.playground.gatk.walkers.AlleleFrequencyWalker; -import java.util.Arrays; -import java.lang.Math; import org.broadinstitute.sting.utils.GenomeLoc; +import java.util.Arrays; + public class AlleleFrequencyEstimate { - /* - static - { - boolean assertsEnabled = false; - assert assertsEnabled = true; // Intentional side effect!!! - if (!assertsEnabled) - { - System.err.printf("\n\n\nERROR: You must run with asserts enabled. \"java -ea\".\n\n\n"); - throw new RuntimeException("Asserts must be enabled!"); - } - } - */ - - //AlleleFrequencyEstimate(); public GenomeLoc location; public char ref; public char alt; @@ -34,7 +20,7 @@ public class AlleleFrequencyEstimate { public int depth; public String notes; public String bases; - public double[][] quals; + //public double[][] quals; public double[] posteriors; public String sample_name; public int n_ref; @@ -48,43 +34,6 @@ public class AlleleFrequencyEstimate { public AlleleFrequencyEstimate(GenomeLoc location, char ref, char alt, int N, double qhat, double qstar, double lodVsRef, double lodVsNextBest, double pBest, double pRef, int depth, String bases, double[][] quals, double[] posteriors, String sample_name) { - /* - if( Double.isNaN(lodVsRef)) { System.out.printf("%s: lodVsRef is NaN\n", location.toString()); } - if( Double.isNaN(lodVsNextBest)) { System.out.printf("%s lodVsNextBest is NaN\n", location.toString()); } - if( Double.isNaN(qhat)) { System.out.printf("%s qhat is NaN\n", location.toString()); } - if( Double.isNaN(qstar)) { System.out.printf("%s qstar is NaN\n", location.toString()); } - if( Double.isNaN(pBest)) { System.out.printf("%s pBest is NaN\n", location.toString()); } - if( Double.isNaN(pRef)) { System.out.printf("%s pRef is NaN (%c %s)\n", location.toString(), ref, bases); } - - if( Double.isInfinite(lodVsRef)) - { - System.out.printf("lodVsRef is Infinite: %s %c %s\n", location.toString(), ref, bases); - for (int i = 0; i < posteriors.length; i++) - { - System.out.printf("POSTERIOR %d %f\n", i, posteriors[i]); - } - } - if( Double.isInfinite(lodVsNextBest)) { System.out.printf("lodVsNextBest is Infinite\n"); } - if( Double.isInfinite(qhat)) { System.out.printf("qhat is Infinite\n"); } - if( Double.isInfinite(qstar)) { System.out.printf("qstar is Infinite\n"); } - if( Double.isInfinite(pBest)) { System.out.printf("pBest is Infinite\n"); } - if( Double.isInfinite(pRef)) { System.out.printf("pRef is Infinite\n"); } - - assert(! Double.isNaN(lodVsRef)); - assert(! Double.isNaN(lodVsNextBest)); - assert(! Double.isNaN(qhat)); - assert(! Double.isNaN(qstar)); - assert(! Double.isNaN(pBest)); - assert(! Double.isNaN(pRef)); - - assert(! Double.isInfinite(lodVsRef)); - assert(! Double.isInfinite(lodVsNextBest)); - assert(! Double.isInfinite(qhat)); - assert(! Double.isInfinite(qstar)); - assert(! Double.isInfinite(pBest)); - assert(! Double.isInfinite(pRef)); - */ - this.location = location; this.ref = ref; this.alt = alt; @@ -98,7 +47,7 @@ public class AlleleFrequencyEstimate { this.depth = depth; this.notes = ""; this.bases = bases; - this.quals = quals; + //this.quals = quals; this.posteriors = posteriors; this.sample_name = sample_name; } diff --git a/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java b/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java index a1022b0e1..0da7b7871 100644 --- a/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java +++ b/java/src/org/broadinstitute/sting/playground/utils/GenotypeLikelihoods.java @@ -37,7 +37,19 @@ public class GenotypeLikelihoods { } public double[] likelihoods; - public String[] genotypes; + public static String[] genotypes = new String[10]; + static { + genotypes[0] = "AA"; + genotypes[1] = "AC"; + genotypes[2] = "AG"; + genotypes[3] = "AT"; + genotypes[4] = "CC"; + genotypes[5] = "CG"; + genotypes[6] = "CT"; + genotypes[7] = "GG"; + genotypes[8] = "GT"; + genotypes[9] = "TT"; + } public int coverage; // The genotype priors; @@ -75,21 +87,12 @@ public class GenotypeLikelihoods { this.priorHomVar = priorHomVar; likelihoods = new double[10]; - genotypes = new String[10]; + coverage = 0; for (int i = 0; i < likelihoods.length; i++) { likelihoods[i] = Math.log10(0.1); } - genotypes[0] = "AA"; - genotypes[1] = "AC"; - genotypes[2] = "AG"; - genotypes[3] = "AT"; - genotypes[4] = "CC"; - genotypes[5] = "CG"; - genotypes[6] = "CT"; - genotypes[7] = "GG"; - genotypes[8] = "GT"; - genotypes[9] = "TT"; + for (int genotypeIndex = 0; genotypeIndex < 10; genotypeIndex++) { onNextBestBasePriors.put(genotypes[genotypeIndex], p2ndon[genotypeIndex]); @@ -405,4 +408,17 @@ public class GenotypeLikelihoods { public void addIndelLikelihood(IndelLikelihood indel_likelihood) { this.indel_likelihood = indel_likelihood; } public IndelLikelihood getIndelLikelihood() { return this.indel_likelihood; } + // TODO: this is bad, but the formats disagree now. + public static void toGLFGenotypePattern() { + genotypes[0] = "AA"; + genotypes[1] = "AT"; + genotypes[2] = "AC"; + genotypes[3] = "AG"; + genotypes[4] = "CC"; + genotypes[5] = "CT"; + genotypes[6] = "CG"; + genotypes[7] = "GG"; + genotypes[8] = "GT"; + genotypes[9] = "TT"; + } } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/GeliAdapter.java b/java/src/org/broadinstitute/sting/utils/genotype/GeliAdapter.java index ac54372fc..1903c0d02 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/GeliAdapter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/GeliAdapter.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.utils.genotype; import edu.mit.broad.picard.genotype.geli.GeliFileWriter; import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMSequenceRecord; import java.io.File; @@ -56,18 +57,16 @@ public class GeliAdapter implements GenotypeWriter { /** - * add a single point genotype call to the + * add a single point genotype call to the genotype likelihood file * - * @param contigName the name of the contig you're calling in - * @param contigLength the contig length + * @param contig the contig you're calling in * @param position the position on the contig * @param referenceBase the reference base * @param readDepth the read depth at the specified position * @param likelihoods the likelihoods of each of the possible alleles */ @Override - public void addGenotypeCall(String contigName, - int contigLength, + public void addGenotypeCall(SAMSequenceRecord contig, int position, float rmsMapQuals, char referenceBase, @@ -79,8 +78,7 @@ public class GeliAdapter implements GenotypeWriter { /** * add a variable length call to the genotyper * - * @param contigName the name of the contig you're calling in - * @param contigLength the contig length + * @param contig the contig you're calling in * @param position the position on the genome * @param rmsMapQuals the root mean square of the mapping qualities * @param readDepth the read depth @@ -90,7 +88,7 @@ public class GeliAdapter implements GenotypeWriter { * @param hetLikelihood the heterozygous likelihood */ @Override - public void addVariableLengthCall(String contigName, int contigLength, int position, float rmsMapQuals, int readDepth, char refBase, IndelLikelihood firstHomZyg, IndelLikelihood secondHomZyg, byte hetLikelihood) { + public void addVariableLengthCall(SAMSequenceRecord contig, int position, float rmsMapQuals, int readDepth, char refBase, IndelLikelihood firstHomZyg, IndelLikelihood secondHomZyg, byte hetLikelihood) { throw new UnsupportedOperationException("Geli format does not support variable length allele calls"); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/GenotypeWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/GenotypeWriter.java index 483754dd9..22966f8ae 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/GenotypeWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/GenotypeWriter.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.utils.genotype; +import net.sf.samtools.SAMSequenceRecord; + /* * Copyright (c) 2009 The Broad Institute * @@ -33,18 +35,17 @@ package org.broadinstitute.sting.utils.genotype; * The interface for storing genotype calls. */ public interface GenotypeWriter { + /** * add a single point genotype call to the * - * @param contigName the name of the contig you're calling in - * @param contigLength the contig length + * @param contig the contig you're calling in * @param position the position on the contig * @param referenceBase the reference base * @param readDepth the read depth at the specified position * @param likelihoods the likelihoods of each of the possible alleles */ - public void addGenotypeCall(String contigName, - int contigLength, + public void addGenotypeCall(SAMSequenceRecord contig, int position, float rmsMapQuals, char referenceBase, @@ -54,8 +55,7 @@ public interface GenotypeWriter { /** * add a variable length call to the genotyper * - * @param contigName the name of the contig you're calling in - * @param contigLength the contig length + * @param contig the contig you're calling in * @param position the position on the genome * @param rmsMapQuals the root mean square of the mapping qualities * @param readDepth the read depth @@ -64,8 +64,7 @@ public interface GenotypeWriter { * @param secondHomZyg the second homozygous indel (if present, null if not) * @param hetLikelihood the heterozygous likelihood */ - public void addVariableLengthCall(String contigName, - int contigLength, + public void addVariableLengthCall(SAMSequenceRecord contig, int position, float rmsMapQuals, int readDepth, diff --git a/java/src/org/broadinstitute/sting/utils/genotype/GenotypeWriterFactory.java b/java/src/org/broadinstitute/sting/utils/genotype/GenotypeWriterFactory.java new file mode 100644 index 000000000..1f766f83d --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/genotype/GenotypeWriterFactory.java @@ -0,0 +1,38 @@ +package org.broadinstitute.sting.utils.genotype; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.utils.genotype.glf.GLFWriter; + +import java.io.File; + + +/** + * @author aaron + *

+ * Class GenotypeWriterFactory + *

+ * A descriptions should go here. Blame aaron if it's missing. + */ +public class GenotypeWriterFactory { + /** available genotype writers */ + public enum GENOTYPE_FORMAT { + GELI, GLF, GFF, TABULAR; + } + + /** + * create a genotype writer + * @param format the format + * @param header the sam file header + * @param destination the destination file + * @return the genotype writer object + */ + public static GenotypeWriter create(GENOTYPE_FORMAT format, SAMFileHeader header, File destination) { + switch (format) { + case GLF: + return new GLFWriter(header.toString(), destination); + case GELI: + return new GeliAdapter(destination, header); + } + return null; + } +} diff --git a/java/src/org/broadinstitute/sting/utils/genotype/LikelihoodObject.java b/java/src/org/broadinstitute/sting/utils/genotype/LikelihoodObject.java index 58bb4ec8b..64cf05d98 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/LikelihoodObject.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/LikelihoodObject.java @@ -1,12 +1,12 @@ package org.broadinstitute.sting.utils.genotype; -import edu.mit.broad.picard.genotype.geli.GenotypeLikelihoods; import edu.mit.broad.picard.genotype.DiploidGenotype; +import edu.mit.broad.picard.genotype.geli.GenotypeLikelihoods; +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.utils.StingException; import java.util.HashMap; -import net.sf.samtools.SAMFileHeader; - /* * Copyright (c) 2009 The Broad Institute @@ -35,43 +35,52 @@ import net.sf.samtools.SAMFileHeader; /** * @author aaron - * - * Class LikelyhoodObject - * - * An object used to store likelyhood information for genotypes. Genotype - * likelihoods are assumed to be infinite (negitive log likelihood), unless set. - * This allows the consumer to make an empty LikelihoodObject, and just set - * those values which have associated likelihood values. - * + *

+ * Class LikelyhoodObject + *

+ * An object used to store likelyhood information for genotypes. Genotype + * likelihoods are assumed to be infinite (negitive log likelihood), unless set. + * This allows the consumer to make an empty LikelihoodObject, and just set + * those values which have associated likelihood values. */ public class LikelihoodObject { + // our possible genotypes, in order according to GLFv3 public enum GENOTYPE { AA, AT, AC, AG, CC, CT, CG, GG, GT, TT } + // our pileup of bases + //final private String basePileup; + // possible types of likihoods to store + public enum LIKELIHOOD_TYPE { NEGITIVE_LOG, LOG, RAW; } + // our qhet and qstar values; wait, what? + // TODO: are these really needed here? We have them to support the tabular output format only. + //private double qhat; + //private double qstar; + // our liklihood storage type protected LIKELIHOOD_TYPE mLikelihoodType = LIKELIHOOD_TYPE.NEGITIVE_LOG; - // default the bestGenotype likelihood to the allele AA + // default the bestGenotype likelihoods to the allele AA protected GENOTYPE bestGenotype = GENOTYPE.AA; // how many genotypes we're storing public static final int genoTypeCount = GENOTYPE.values().length; // the associated negitive log likelihoods for each genotype - protected final HashMap likelihood = new HashMap(); + protected final HashMap likelihoods = new HashMap(); /** create a blank likelihood object */ public LikelihoodObject() { for (GENOTYPE type : GENOTYPE.values()) { - likelihood.put(type, Double.MAX_VALUE); + likelihoods.put(type, Double.MAX_VALUE); } } @@ -82,15 +91,18 @@ public class LikelihoodObject { * * @param lk the likelihood object */ - public LikelihoodObject( GenotypeLikelihoods lk ) { + public LikelihoodObject(GenotypeLikelihoods lk) { + mLikelihoodType = LIKELIHOOD_TYPE.LOG; Double minValue = Double.MAX_VALUE; for (GENOTYPE type : GENOTYPE.values()) { byte[] bases = new byte[2]; bases[0] = (byte) type.toString().charAt(0); bases[1] = (byte) type.toString().charAt(1); double val = -1.0d * lk.getLikelihood(DiploidGenotype.fromBases(bases)); - likelihood.put(type, val); - if (val < minValue) { bestGenotype = type; } + likelihoods.put(type, val); + if (val < minValue) { + bestGenotype = type; + } } } @@ -98,18 +110,28 @@ public class LikelihoodObject { * create a likelyhood object, given an array of genotype scores in GLFv3 ordering * * @param values an array of int's from 0 to 255, representing the negitive log likelihoods. + * @param type the likelihood storage type */ - public LikelihoodObject( double[] values ) { + public LikelihoodObject(double[] values, LIKELIHOOD_TYPE type) { + mLikelihoodType = type; if (values.length != GENOTYPE.values().length) { throw new IllegalArgumentException("invalid array passed to LikelihoodObject, should be size " + GENOTYPE.values().length); } + findBestLikelihood(values); + } + + /** + * find the best likelihood + * @param values + */ + private void findBestLikelihood(double[] values) { int index = 0; double lowestScore = Double.MAX_VALUE; - for (GENOTYPE type : GENOTYPE.values()) { - likelihood.put(type, values[index]); + for (GENOTYPE t : GENOTYPE.values()) { + likelihoods.put(t, values[index]); if (values[index] < lowestScore) { lowestScore = values[index]; - bestGenotype = type; + bestGenotype = t; } ++index; } @@ -119,11 +141,11 @@ public class LikelihoodObject { * set the likelihood, given it's probability and the genotype * * @param type the genotype - * @param lh the likelihood as a double between 0 and 1, which is converted to a byte + * @param lh the likelihood as a double */ - public void setLikelihood( GENOTYPE type, double lh ) { - likelihood.put(type, lh); - if (lh < likelihood.get(this.bestGenotype)) { + public void setLikelihood(GENOTYPE type, double lh) { + likelihoods.put(type, lh); + if (lh < likelihoods.get(this.bestGenotype)) { this.bestGenotype = type; } } @@ -135,7 +157,7 @@ public class LikelihoodObject { * @return the min value */ public double getBestLikelihood() { - return likelihood.get(this.bestGenotype); + return likelihoods.get(this.bestGenotype); } /** @@ -149,7 +171,7 @@ public class LikelihoodObject { short ret[] = new short[GENOTYPE.values().length]; int index = 0; for (GENOTYPE type : GENOTYPE.values()) { - ret[index] = ( likelihood.get(type).intValue() > 254 ) ? 255 : (short) likelihood.get(type).intValue(); + ret[index] = (likelihoods.get(type).intValue() > 254) ? 255 : (short) likelihoods.get(type).intValue(); ++index; } return ret; @@ -166,7 +188,8 @@ public class LikelihoodObject { double[] ft = new double[10]; int index = 0; for (GENOTYPE T : GENOTYPE.values()) { - ft[index] = this.likelihood.get(T).floatValue(); + ft[index] = this.likelihoods.get(T).doubleValue(); + index++; } return ft; } @@ -177,16 +200,16 @@ public class LikelihoodObject { * * @return a GenotypeLikelihoods object representing our data */ - public GenotypeLikelihoods convert( SAMFileHeader samHeader, int seqIndex, int seqPosition, byte refBase ) { + public GenotypeLikelihoods convert(SAMFileHeader samHeader, int seqIndex, int seqPosition, byte refBase) { double[] ft = toDoubleArray(); float[] db = new float[ft.length]; int index = 0; if (this.mLikelihoodType == LIKELIHOOD_TYPE.NEGITIVE_LOG) { - for (;index < ft.length; index++) { - db[index] = ((float)ft[index] * -1.0f); + for (; index < ft.length; index++) { + db[index] = ((float) ft[index] * -1.0f); } } else if (this.mLikelihoodType == LIKELIHOOD_TYPE.RAW) { - for (;index < ft.length; index++) { + for (; index < ft.length; index++) { db[index] = (float) Math.log(ft[index]); } } @@ -195,18 +218,45 @@ public class LikelihoodObject { /** * getter for the likelihood type + * * @return our likelihood storage type */ - public LIKELIHOOD_TYPE getLikelihoodType() { + public LIKELIHOOD_TYPE getLikelihoodType() { return mLikelihoodType; } + + /** + * validate a genotype score + * + * @param score the score to validate + */ + public void validateScore(double score) { + int x = 0; + switch (mLikelihoodType) { + case NEGITIVE_LOG: + if (score < 0) + throw new StingException("Likelikhood score of " + score + " is invalid, for NEGITIVE_LOG it must be greater than or equal to 0"); + break; + case LOG: + if (score > 0) + throw new StingException("Likelikhood score of " + score + " is invalid, for LOG it must be less than or equal to 0"); + break; + case RAW: + if (score < 0 || score > 1) + throw new StingException("Likelikhood score of " + score + " is invalid, for RAW it must be [0,1]"); + break; + } + } + + /** * set our likelihood storage type, and adjust our current likelihood values to reflect - * the new setting. + * the new setting. + * * @param likelihood the type to set the values to. */ - public void setLikelihoodType( LIKELIHOOD_TYPE likelihood ) { + public void setLikelihoodType(LIKELIHOOD_TYPE likelihood) { if (likelihood == mLikelihoodType) return; if (mLikelihoodType == LIKELIHOOD_TYPE.RAW) { @@ -214,25 +264,27 @@ public class LikelihoodObject { if (likelihood == LIKELIHOOD_TYPE.NEGITIVE_LOG) { mult = -1.0; } - for (Double d : this.likelihood.values()) { - d = mult * Math.log(d); + // one of us in log, the other negitive log, it doesn't matter which + for (GENOTYPE g : likelihoods.keySet()) { + likelihoods.put(g, -1.0 * Math.log(likelihoods.get(g))); } - } - else if (likelihood == LIKELIHOOD_TYPE.RAW) { + } else if (likelihood == LIKELIHOOD_TYPE.RAW) { double mult = 1.0; if (mLikelihoodType == LIKELIHOOD_TYPE.NEGITIVE_LOG) { mult = -1.0; } - for (Double d : this.likelihood.values()) { - d = Math.pow(d*mult,10); - } - } - else { // one of us in log, the other negitive log, it doesn't matter which - for (Double d : this.likelihood.values()) { - d = -1.0 * Math.log(d); + for (GENOTYPE g : likelihoods.keySet()) { + likelihoods.put(g, Math.pow(likelihoods.get(g) * mult, 10)); + } + } else { + // one of us in log, the other negitive log, it doesn't matter which + for (GENOTYPE g : likelihoods.keySet()) { + likelihoods.put(g, -1.0 * likelihoods.get(g)); } } this.mLikelihoodType = likelihood; } } + + diff --git a/java/src/org/broadinstitute/sting/utils/genotype/TabularLFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/TabularLFWriter.java new file mode 100644 index 000000000..83b7c9d4c --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/genotype/TabularLFWriter.java @@ -0,0 +1,99 @@ +package org.broadinstitute.sting.utils.genotype; + +import net.sf.samtools.SAMSequenceRecord; +import org.broadinstitute.sting.utils.StingException; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; + + +/** + * + * @author aaron + * + * Class TabularLF + * + * the tabular likelihood format, as an implementation of the genotype interface + */ +public class TabularLFWriter implements GenotypeWriter { + /** + * where to print the tabular genotype likelihood info to + */ + public PrintStream outStream; + + /** + * construct, writing to a specified file + * @param writeTo + */ + public TabularLFWriter(File writeTo) { + try { + outStream = new PrintStream(writeTo); + } catch (FileNotFoundException e) { + throw new StingException("Unable to write to specified file: " + writeTo.getName()); + } + // print the header out + outStream.println("location sample_name ref alt genotype qhat qstar lodVsRef lodVsNextBest depth bases"); + } + + /** + * add a single point genotype call to the + * + * @param contig the contig you're calling in + * @param position the position on the contig + * @param referenceBase the reference base + * @param readDepth the read depth at the specified position + * @param likelihoods the likelihoods of each of the possible alleles + */ + @Override + public void addGenotypeCall(SAMSequenceRecord contig, int position, float rmsMapQuals, char referenceBase, int readDepth, LikelihoodObject likelihoods) { + /**return String.format("%s %s %c %c %s %f %f %f %f %d %s", + location, + contig.getSpecies(), + ref, + alt, + genotype(), + qhat, + qstar, + lodVsRef, + lodVsNextBest, + depth, + bases);*/ + } + + /** + * add a variable length call to the genotyper + * + * @param contig the contig you're calling in + * @param position the position on the genome + * @param rmsMapQuals the root mean square of the mapping qualities + * @param readDepth the read depth + * @param refBase the reference base + * @param firstHomZyg the first homozygous indel + * @param secondHomZyg the second homozygous indel (if present, null if not) + * @param hetLikelihood the heterozygous likelihood + */ + @Override + public void addVariableLengthCall(SAMSequenceRecord contig, int position, float rmsMapQuals, int readDepth, char refBase, IndelLikelihood firstHomZyg, IndelLikelihood secondHomZyg, byte hetLikelihood) { + throw new StingException("TabularLFWriter doesn't support variable length calls"); + } + + /** + * add a no call to the genotype file, if supported. + * + * @param position + * @param readDepth + */ + @Override + public void addNoCall(int position, int readDepth) { + throw new StingException("TabularLFWriter doesn't support no-calls"); + } + + /** finish writing, closing any open files. */ + @Override + public void close() { + if (this.outStream != null) { + outStream.close(); + } + } +} diff --git a/java/src/org/broadinstitute/sting/utils/genotype/gff/GFFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/gff/GFFWriter.java new file mode 100644 index 000000000..d3756a530 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/genotype/gff/GFFWriter.java @@ -0,0 +1,66 @@ +package org.broadinstitute.sting.utils.genotype.gff; + +import org.broadinstitute.sting.utils.genotype.GenotypeWriter; +import org.broadinstitute.sting.utils.genotype.LikelihoodObject; +import org.broadinstitute.sting.utils.genotype.IndelLikelihood; +import net.sf.samtools.SAMSequenceRecord; + + +/** + * @author aaron + *

+ * Class GFFWriter + *

+ * A descriptions should go here. Blame aaron if it's missing. + */ +public class GFFWriter implements GenotypeWriter { + + /** + * add a single point genotype call to the file + * + * @param contig the contig you're calling in + * @param position the position on the contig + * @param referenceBase the reference base + * @param readDepth the read depth at the specified position + * @param likelihoods the likelihoods of each of the possible alleles + */ + @Override + public void addGenotypeCall(SAMSequenceRecord contig, int position, float rmsMapQuals, char referenceBase, int readDepth, LikelihoodObject likelihoods) { + //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * add a variable length call to the genotype file + * + * @param contig the contig you're calling in + * @param position the position on the genome + * @param rmsMapQuals the root mean square of the mapping qualities + * @param readDepth the read depth + * @param refBase the reference base + * @param firstHomZyg the first homozygous indel + * @param secondHomZyg the second homozygous indel (if present, null if not) + * @param hetLikelihood the heterozygous likelihood + */ + @Override + public void addVariableLengthCall(SAMSequenceRecord contig, int position, float rmsMapQuals, int readDepth, char refBase, IndelLikelihood firstHomZyg, IndelLikelihood secondHomZyg, byte hetLikelihood) { + //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * add a no call to the genotype file, if supported. + * + * @param position the position + * @param readDepth the read depth + */ + @Override + public void addNoCall(int position, int readDepth) { + //To change body of implemented methods use File | Settings | File Templates. + } + + + /** finish writing, closing any open files. */ + @Override + public void close() { + //To change body of implemented methods use File | Settings | File Templates. + } +} diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java index e17d31eb3..d6448fef5 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFRecord.java @@ -194,7 +194,8 @@ abstract class GLFRecord { short bite = ((short) (this.getRecordType().getReadTypeValue() << 4 | (refBase.getBaseHexValue() & 0x0f))); out.writeUByte((short) (this.getRecordType().getReadTypeValue() << 4 | (refBase.getBaseHexValue() & 0x0f))); out.writeUInt(((Long) offset).intValue()); - out.writeUInt((new Long(readDepth).intValue())); + int write = ((new Long(readDepth).intValue()) | this.minimumLikelihood << 24); + out.writeUInt(write); out.writeUByte((short) rmsMapQ); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java index 3efc3ba90..cbab0c727 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/GLFWriter.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.utils.genotype.glf; import net.sf.samtools.util.BinaryCodec; import net.sf.samtools.util.BlockCompressedOutputStream; +import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.utils.genotype.GenotypeWriter; import org.broadinstitute.sting.utils.genotype.IndelLikelihood; import org.broadinstitute.sting.utils.genotype.LikelihoodObject; @@ -53,6 +54,9 @@ public class GLFWriter implements GenotypeWriter { private String referenceSequenceName = null; private long referenceSequenceLength = 0; + // the last position written + private int lastPos = 0; + /** * The public constructor for creating a GLF object * @@ -69,17 +73,15 @@ public class GLFWriter implements GenotypeWriter { /** * add a point genotype to the GLF writer * - * @param contigName the name of the contig you're calling in - * @param contigLength the contig length + * @param contig the name of the contig you're calling in * @param refBase the reference base, as a char - * @param genomicLoc the location, as an offset from the previous glf record + * @param genomicLoc the location the location on the reference contig * @param readDepth the read depth at the specified postion * @param rmsMapQ the root mean square of the mapping quality * @param lhValues the GenotypeLikelihoods object, representing the genotype likelyhoods */ @Override - public void addGenotypeCall(String contigName, - int contigLength, + public void addGenotypeCall(SAMSequenceRecord contig, int genomicLoc, float rmsMapQ, char refBase, @@ -87,23 +89,23 @@ public class GLFWriter implements GenotypeWriter { LikelihoodObject lhValues) { // check if we've jumped to a new contig - checkSequence(contigName, contigLength); + checkSequence(contig.getSequenceName(), contig.getSequenceLength()); SinglePointCall call = new SinglePointCall(refBase, - genomicLoc, + genomicLoc - lastPos, readDepth, (short) rmsMapQ, lhValues.toDoubleArray()); + lastPos = genomicLoc; call.write(this.outputBinaryCodec); } /** * add a variable length (indel, deletion, etc) to the genotype writer * - * @param contigName the name of the contig you're calling in - * @param contigLength the contig length + * @param contig the name of the contig you're calling in * @param refBase the reference base - * @param genomicLoc the location, as an offset from the previous glf record + * @param genomicLoc the location on the reference contig * @param readDepth the read depth at the specified postion * @param rmsMapQ the root mean square of the mapping quality * @param firstHomZyg the first homozygous call @@ -111,8 +113,7 @@ public class GLFWriter implements GenotypeWriter { * @param hetLikelihood the negitive log likelihood of the heterozygote, from 0 to 255 */ @Override - public void addVariableLengthCall(String contigName, - int contigLength, + public void addVariableLengthCall(SAMSequenceRecord contig, int genomicLoc, float rmsMapQ, int readDepth, @@ -122,11 +123,11 @@ public class GLFWriter implements GenotypeWriter { byte hetLikelihood) { // check if we've jumped to a new contig - checkSequence(contigName, contigLength); - + checkSequence(contig.getSequenceName(), contig.getSequenceLength()); + // normalize the two VariableLengthCall call = new VariableLengthCall(refBase, - genomicLoc, + genomicLoc - lastPos, readDepth, (short) rmsMapQ, firstHomZyg.getLikelihood(), @@ -136,7 +137,7 @@ public class GLFWriter implements GenotypeWriter { firstHomZyg.getIndelSequence(), secondHomZyg.getLengthOfIndel(), secondHomZyg.getIndelSequence()); - + lastPos = genomicLoc; call.write(this.outputBinaryCodec); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/glf/SinglePointCall.java b/java/src/org/broadinstitute/sting/utils/genotype/glf/SinglePointCall.java index 2574e2e1e..a8fb075c5 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/glf/SinglePointCall.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/glf/SinglePointCall.java @@ -37,7 +37,7 @@ import net.sf.samtools.util.BinaryCodec; */ class SinglePointCall extends GLFRecord { - // our likelihood object + // our likelihoods object private double likelihoods[]; /** diff --git a/java/test/org/broadinstitute/sting/utils/genotype/GeliAdapterTest.java b/java/test/org/broadinstitute/sting/utils/genotype/GeliAdapterTest.java index d9a1b4d29..ccd4953d7 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/GeliAdapterTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/GeliAdapterTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.genotype; import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.junit.Test; @@ -55,8 +56,9 @@ public class GeliAdapterTest extends BaseTest { File fl = new File("testFile.txt"); SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(2,1,10); adapter = new GeliAdapter(fl,header); - LikelihoodObject obj = new LikelihoodObject(createFakeLikelihoods()); - adapter.addGenotypeCall("chr1",10,100,100,'A',100,obj); + LikelihoodObject obj = new LikelihoodObject(createFakeLikelihoods(), LikelihoodObject.LIKELIHOOD_TYPE.LOG); + SAMSequenceRecord rec = new SAMSequenceRecord("chr1",10); + adapter.addGenotypeCall(rec,100,100,'A',100,obj); adapter.close(); } diff --git a/java/test/org/broadinstitute/sting/utils/genotype/LikelihoodObjectTest.java b/java/test/org/broadinstitute/sting/utils/genotype/LikelihoodObjectTest.java index 644a8a559..97ae2fcff 100755 --- a/java/test/org/broadinstitute/sting/utils/genotype/LikelihoodObjectTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/LikelihoodObjectTest.java @@ -52,7 +52,7 @@ public class LikelihoodObjectTest extends BaseTest { @Test public void testBlankConstruction() { mLO = new LikelihoodObject(); - assertTrue(mLO.likelihood.size() == LikelihoodObject.GENOTYPE.values().length); + assertTrue(mLO.likelihoods.size() == LikelihoodObject.GENOTYPE.values().length); } @Test @@ -61,12 +61,12 @@ public class LikelihoodObjectTest extends BaseTest { for (int x = 0; x < 10; x++) { ray[x] = ( x * 25 ); } - mLO = new LikelihoodObject(ray); - assertTrue(mLO.likelihood.size() == LikelihoodObject.GENOTYPE.values().length); + mLO = new LikelihoodObject(ray,LikelihoodObject.LIKELIHOOD_TYPE.NEGITIVE_LOG); + assertTrue(mLO.likelihoods.size() == LikelihoodObject.GENOTYPE.values().length); int index = 0; for (LikelihoodObject.GENOTYPE t : LikelihoodObject.GENOTYPE.values()) { - assertTrue(ray[index] == mLO.likelihood.get(t)); + assertTrue(ray[index] == mLO.likelihoods.get(t)); ++index; } } @@ -77,8 +77,8 @@ public class LikelihoodObjectTest extends BaseTest { for (int x = 0; x < 10; x++) { ray[x] = ( x * 25.0 ); } - mLO = new LikelihoodObject(ray); - assertTrue(mLO.likelihood.size() == LikelihoodObject.GENOTYPE.values().length); + mLO = new LikelihoodObject(ray,LikelihoodObject.LIKELIHOOD_TYPE.NEGITIVE_LOG); + assertTrue(mLO.likelihoods.size() == LikelihoodObject.GENOTYPE.values().length); int index = 0; short[] ret = mLO.toByteArray(); @@ -104,8 +104,8 @@ public class LikelihoodObjectTest extends BaseTest { ray[x] = ( 240.0 ); } ray [5] = 0; - mLO = new LikelihoodObject(ray); - assertTrue(mLO.likelihood.size() == LikelihoodObject.GENOTYPE.values().length); + mLO = new LikelihoodObject(ray, LikelihoodObject.LIKELIHOOD_TYPE.NEGITIVE_LOG); + assertTrue(mLO.likelihoods.size() == LikelihoodObject.GENOTYPE.values().length); short smallest = (short)mLO.getBestLikelihood(); assertTrue(smallest == 0); int index = 0; @@ -122,7 +122,7 @@ public class LikelihoodObjectTest extends BaseTest { for (LikelihoodObject.GENOTYPE t : LikelihoodObject.GENOTYPE.values()) { mLO.setLikelihood(t,128); } - assertTrue(mLO.likelihood.size() == LikelihoodObject.GENOTYPE.values().length); + assertTrue(mLO.likelihoods.size() == LikelihoodObject.GENOTYPE.values().length); int index = 0; short[] ret = mLO.toByteArray();