Refactoring, cleanup, and performance improvements to ProduceBeagleInput. It's really a shame that there's no integration tests...

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5418 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-03-11 13:55:30 +00:00
parent 097a9a59e8
commit ccc773d175
2 changed files with 49 additions and 61 deletions

View File

@ -60,8 +60,6 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@Output(doc="File to which BEAGLE input should be written",required=true) @Output(doc="File to which BEAGLE input should be written",required=true)
protected PrintStream beagleWriter = null; protected PrintStream beagleWriter = null;
@Argument(fullName = "genotypes_file", shortName = "genotypes", doc = "File to print reference genotypes for error analysis", required = false)
public PrintStream beagleGenotypesWriter = null;
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false) @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
public double insertedNoCallRate = 0; public double insertedNoCallRate = 0;
@Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false) @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
@ -81,7 +79,6 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
private Set<String> samples = null; private Set<String> samples = null;
private Set<String> BOOTSTRAP_FILTER = new HashSet<String>( Arrays.asList("bootstrap") ); private Set<String> BOOTSTRAP_FILTER = new HashSet<String>( Arrays.asList("bootstrap") );
private Random generator; private Random generator;
private int bootstrapCount = -1;
private int bootstrapSetSize = 0; private int bootstrapSetSize = 0;
private int testSetSize = 0; private int testSetSize = 0;
@ -95,8 +92,6 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
beagleWriter.print(String.format(" %s %s %s", sample, sample, sample)); beagleWriter.print(String.format(" %s %s %s", sample, sample, sample));
beagleWriter.println(); beagleWriter.println();
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.println("dummy header");
if ( bootstrapVCFOutput != null ) { if ( bootstrapVCFOutput != null ) {
initializeVcfWriter(); initializeVcfWriter();
@ -137,7 +132,7 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
} }
public boolean goodSite(VariantContext v) { public boolean goodSite(VariantContext v) {
return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes(); return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes();
} }
public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) { public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) {
@ -165,13 +160,14 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
} }
} }
private final static double[] HAPLOID_FLAT_LOG10_LIKELIHOODS = MathUtils.toLog10(new double[]{ 0.5, 0.0, 0.5 });
private final static double[] DIPLOID_FLAT_LOG10_LIKELIHOODS = MathUtils.toLog10(new double[]{ 0.33, 0.33, 0.33 });
public void writeBeagleOutput(VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) { public void writeBeagleOutput(VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),preferredVC); GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),preferredVC);
beagleWriter.print(String.format("%s:%d ",currentLoc.getContig(),currentLoc.getStart())); StringBuffer beagleOut = new StringBuffer();
if ( beagleGenotypesWriter != null ) {
beagleGenotypesWriter.print(String.format("%s ",VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),preferredVC).toString()));
}
beagleOut.append(String.format("%s:%d ",currentLoc.getContig(),currentLoc.getStart()));
for ( Allele allele : preferredVC.getAlleles() ) { for ( Allele allele : preferredVC.getAlleles() ) {
String bglPrintString; String bglPrintString;
if (allele.isNoCall() || allele.isNull()) if (allele.isNoCall() || allele.isNull())
@ -179,18 +175,16 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
else else
bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele
beagleWriter.print(String.format("%s ", bglPrintString)); beagleOut.append(String.format("%s ", bglPrintString));
} }
Map<String,Genotype> preferredGenotypes = preferredVC.getGenotypes(); Map<String,Genotype> preferredGenotypes = preferredVC.getGenotypes();
Map<String,Genotype> otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null; Map<String,Genotype> otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
boolean isValidation;
for ( String sample : samples ) { for ( String sample : samples ) {
boolean isMaleOnChrX = false; boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getToolkit().getSampleById(sample).isMale();
if( CHECK_IS_MALE_ON_CHR_X && getToolkit().getSampleById(sample).isMale() ) {
isMaleOnChrX = true;
}
Genotype genotype; Genotype genotype;
boolean isValidation;
// use sample as key into genotypes structure // use sample as key into genotypes structure
if ( preferredGenotypes.keySet().contains(sample) ) { if ( preferredGenotypes.keySet().contains(sample) ) {
genotype = preferredGenotypes.get(sample); genotype = preferredGenotypes.get(sample);
@ -202,36 +196,22 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
// there is magically no genotype for this sample. // there is magically no genotype for this sample.
throw new StingException("Sample "+sample+" arose with no genotype in variant or validation VCF. This should never happen."); throw new StingException("Sample "+sample+" arose with no genotype in variant or validation VCF. This should never happen.");
} }
/**
/*
* Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key * Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key
*/ */
double [] log10Likelihoods = null;
if ( (isValidation && prior < 0.0) || genotype.hasLikelihoods() ) { if ( (isValidation && prior < 0.0) || genotype.hasLikelihoods() ) {
double[] likeArray = genotype.getLikelihoods().getAsVector(); log10Likelihoods = genotype.getLikelihoods().getAsVector();
if( isMaleOnChrX ) {
likeArray[1] = -255;
}
double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(likeArray);
// see if we need to randomly mask out genotype in this position. // see if we need to randomly mask out genotype in this position.
Double d = generator.nextDouble(); if ( generator.nextDouble() <= insertedNoCallRate ) {
if (d > insertedNoCallRate ) {
// System.out.format("%5.4f ", d);
for (Double likeVal: normalizedLikelihoods)
beagleWriter.print(String.format("%5.4f ",likeVal));
}
else {
// we are masking out this genotype // we are masking out this genotype
if( isMaleOnChrX ) { log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
beagleWriter.print("0.5 0.0 0.5 ");
} else {
beagleWriter.print("0.33 0.33 0.33 ");
}
} }
if ( beagleGenotypesWriter != null && genotype.isCalled() ) { if( isMaleOnChrX ) {
char a = genotype.getAllele(0).toString().charAt(0); log10Likelihoods[1] = -255; // todo -- warning this is dangerous for multi-allele case
char b = genotype.getAllele(0).toString().charAt(0);
beagleGenotypesWriter.format("%c %c ", a, b);
} }
} }
/** /**
@ -248,33 +228,28 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
else if (genotype.isHet()) { AB = prior; } else if (genotype.isHet()) { AB = prior; }
else if (genotype.isHomVar()) { BB = prior; } else if (genotype.isHomVar()) { BB = prior; }
if( isMaleOnChrX ) { log10Likelihoods = MathUtils.toLog10(new double[]{ AA, isMaleOnChrX ? 0.0 : AB, BB });
beagleWriter.printf("%.2f %.2f %.2f ", AA, 0.0, BB);
} else {
beagleWriter.printf("%.2f %.2f %.2f ", AA, AB, BB);
}
if (beagleGenotypesWriter != null) {
char a = genotype.getAllele(0).toString().charAt(0);
char b = genotype.getAllele(0).toString().charAt(0);
beagleGenotypesWriter.format("%c %c ", a, b);
}
} }
else { else {
if( isMaleOnChrX ) { log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
beagleWriter.print("0.5 0.0 0.5 ");
} else {
beagleWriter.print("0.33 0.33 0.33 ");
} // write 1/3 likelihoods for uncalled genotypes.
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.print(". . ");
} }
writeSampleLikelihoods(beagleOut, log10Likelihoods);
} }
beagleWriter.println(); beagleWriter.println(beagleOut.toString());
if (beagleGenotypesWriter != null) }
beagleGenotypesWriter.println();
private void writeSampleLikelihoods( StringBuffer out, double[] log10Likelihoods ) {
double[] normalizedLog10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods);
// see if we need to randomly mask out genotype in this position.
// todo -- remove me after testing
if ( log10Likelihoods == HAPLOID_FLAT_LOG10_LIKELIHOODS || log10Likelihoods == DIPLOID_FLAT_LOG10_LIKELIHOODS )
for (double likeVal: normalizedLog10Likelihoods)
out.append(String.format("%.2f ",likeVal));
else
for (double likeVal: normalizedLog10Likelihoods)
out.append(String.format("%5.4f ",likeVal));
} }

View File

@ -82,6 +82,19 @@ public class MathUtils {
return s; return s;
} }
/**
* Converts a real space array of probabilities into a log10 array
* @param prRealSpace
* @return
*/
public static double[] toLog10(double[] prRealSpace) {
double[] log10s = new double[prRealSpace.length];
for ( int i = 0; i < prRealSpace.length; i++ )
log10s[i] = Math.log10(prRealSpace[i]);
return log10s;
}
public static double log10sumLog10(double[] log10p, int start) { public static double log10sumLog10(double[] log10p, int start) {
double sum = 0.0; double sum = 0.0;