From ccc773d175f73772acd5f8943eb106d08c75eb66 Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 11 Mar 2011 13:55:30 +0000 Subject: [PATCH] Refactoring, cleanup, and performance improvements to ProduceBeagleInput. It's really a shame that there's no integration tests... git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5418 348d0f76-0448-11de-a6fe-93d51630548a --- .../beagle/ProduceBeagleInputWalker.java | 97 +++++++------------ .../broadinstitute/sting/utils/MathUtils.java | 13 +++ 2 files changed, 49 insertions(+), 61 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index 968bd7131..4c4b34479 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -60,8 +60,6 @@ public class ProduceBeagleInputWalker extends RodWalker { @Output(doc="File to which BEAGLE input should be written",required=true) protected PrintStream beagleWriter = null; - @Argument(fullName = "genotypes_file", shortName = "genotypes", doc = "File to print reference genotypes for error analysis", required = false) - public PrintStream beagleGenotypesWriter = null; @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false) public double insertedNoCallRate = 0; @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false) @@ -81,7 +79,6 @@ public class ProduceBeagleInputWalker extends RodWalker { private Set samples = null; private Set BOOTSTRAP_FILTER = new HashSet( Arrays.asList("bootstrap") ); private Random generator; - private int bootstrapCount = -1; private int bootstrapSetSize = 0; private int testSetSize = 0; @@ -95,8 +92,6 @@ public class ProduceBeagleInputWalker extends RodWalker { beagleWriter.print(String.format(" %s %s %s", sample, sample, sample)); beagleWriter.println(); - if (beagleGenotypesWriter != null) - beagleGenotypesWriter.println("dummy header"); if ( bootstrapVCFOutput != null ) { initializeVcfWriter(); @@ -137,7 +132,7 @@ public class ProduceBeagleInputWalker extends RodWalker { } public boolean goodSite(VariantContext v) { - return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes(); + return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes(); } public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) { @@ -165,13 +160,14 @@ public class ProduceBeagleInputWalker extends RodWalker { } } + private final static double[] HAPLOID_FLAT_LOG10_LIKELIHOODS = MathUtils.toLog10(new double[]{ 0.5, 0.0, 0.5 }); + private final static double[] DIPLOID_FLAT_LOG10_LIKELIHOODS = MathUtils.toLog10(new double[]{ 0.33, 0.33, 0.33 }); + public void writeBeagleOutput(VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) { GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),preferredVC); - beagleWriter.print(String.format("%s:%d ",currentLoc.getContig(),currentLoc.getStart())); - if ( beagleGenotypesWriter != null ) { - beagleGenotypesWriter.print(String.format("%s ",VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),preferredVC).toString())); - } + StringBuffer beagleOut = new StringBuffer(); + beagleOut.append(String.format("%s:%d ",currentLoc.getContig(),currentLoc.getStart())); for ( Allele allele : preferredVC.getAlleles() ) { String bglPrintString; if (allele.isNoCall() || allele.isNull()) @@ -179,18 +175,16 @@ public class ProduceBeagleInputWalker extends RodWalker { else bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele - beagleWriter.print(String.format("%s ", bglPrintString)); + beagleOut.append(String.format("%s ", bglPrintString)); } Map preferredGenotypes = preferredVC.getGenotypes(); Map otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null; - boolean isValidation; for ( String sample : samples ) { - boolean isMaleOnChrX = false; - if( CHECK_IS_MALE_ON_CHR_X && getToolkit().getSampleById(sample).isMale() ) { - isMaleOnChrX = true; - } + boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getToolkit().getSampleById(sample).isMale(); + Genotype genotype; + boolean isValidation; // use sample as key into genotypes structure if ( preferredGenotypes.keySet().contains(sample) ) { genotype = preferredGenotypes.get(sample); @@ -202,36 +196,22 @@ public class ProduceBeagleInputWalker extends RodWalker { // there is magically no genotype for this sample. throw new StingException("Sample "+sample+" arose with no genotype in variant or validation VCF. This should never happen."); } - /** + + /* * Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key */ + double [] log10Likelihoods = null; if ( (isValidation && prior < 0.0) || genotype.hasLikelihoods() ) { - double[] likeArray = genotype.getLikelihoods().getAsVector(); - if( isMaleOnChrX ) { - likeArray[1] = -255; - } - double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(likeArray); + log10Likelihoods = genotype.getLikelihoods().getAsVector(); + // see if we need to randomly mask out genotype in this position. - Double d = generator.nextDouble(); - if (d > insertedNoCallRate ) { -// System.out.format("%5.4f ", d); - for (Double likeVal: normalizedLikelihoods) - beagleWriter.print(String.format("%5.4f ",likeVal)); - } - else { + if ( generator.nextDouble() <= insertedNoCallRate ) { // we are masking out this genotype - if( isMaleOnChrX ) { - beagleWriter.print("0.5 0.0 0.5 "); - } else { - beagleWriter.print("0.33 0.33 0.33 "); - } + log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS; } - if ( beagleGenotypesWriter != null && genotype.isCalled() ) { - char a = genotype.getAllele(0).toString().charAt(0); - char b = genotype.getAllele(0).toString().charAt(0); - - beagleGenotypesWriter.format("%c %c ", a, b); + if( isMaleOnChrX ) { + log10Likelihoods[1] = -255; // todo -- warning this is dangerous for multi-allele case } } /** @@ -248,33 +228,28 @@ public class ProduceBeagleInputWalker extends RodWalker { else if (genotype.isHet()) { AB = prior; } else if (genotype.isHomVar()) { BB = prior; } - if( isMaleOnChrX ) { - beagleWriter.printf("%.2f %.2f %.2f ", AA, 0.0, BB); - } else { - beagleWriter.printf("%.2f %.2f %.2f ", AA, AB, BB); - } - - if (beagleGenotypesWriter != null) { - char a = genotype.getAllele(0).toString().charAt(0); - char b = genotype.getAllele(0).toString().charAt(0); - - beagleGenotypesWriter.format("%c %c ", a, b); - } + log10Likelihoods = MathUtils.toLog10(new double[]{ AA, isMaleOnChrX ? 0.0 : AB, BB }); } else { - if( isMaleOnChrX ) { - beagleWriter.print("0.5 0.0 0.5 "); - } else { - beagleWriter.print("0.33 0.33 0.33 "); - } // write 1/3 likelihoods for uncalled genotypes. - if (beagleGenotypesWriter != null) - beagleGenotypesWriter.print(". . "); + log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS; } + + writeSampleLikelihoods(beagleOut, log10Likelihoods); } - beagleWriter.println(); - if (beagleGenotypesWriter != null) - beagleGenotypesWriter.println(); + beagleWriter.println(beagleOut.toString()); + } + + private void writeSampleLikelihoods( StringBuffer out, double[] log10Likelihoods ) { + double[] normalizedLog10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods); + // see if we need to randomly mask out genotype in this position. + // todo -- remove me after testing + if ( log10Likelihoods == HAPLOID_FLAT_LOG10_LIKELIHOODS || log10Likelihoods == DIPLOID_FLAT_LOG10_LIKELIHOODS ) + for (double likeVal: normalizedLog10Likelihoods) + out.append(String.format("%.2f ",likeVal)); + else + for (double likeVal: normalizedLog10Likelihoods) + out.append(String.format("%5.4f ",likeVal)); } diff --git a/java/src/org/broadinstitute/sting/utils/MathUtils.java b/java/src/org/broadinstitute/sting/utils/MathUtils.java index afdf151d6..cec5c5b0f 100755 --- a/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -82,6 +82,19 @@ public class MathUtils { return s; } + + /** + * Converts a real space array of probabilities into a log10 array + * @param prRealSpace + * @return + */ + public static double[] toLog10(double[] prRealSpace) { + double[] log10s = new double[prRealSpace.length]; + for ( int i = 0; i < prRealSpace.length; i++ ) + log10s[i] = Math.log10(prRealSpace[i]); + return log10s; + } + public static double log10sumLog10(double[] log10p, int start) { double sum = 0.0;