Refactoring, cleanup, and performance improvements to ProduceBeagleInput. It's really a shame that there's no integration tests...
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5418 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
097a9a59e8
commit
ccc773d175
|
|
@ -60,8 +60,6 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
||||||
@Output(doc="File to which BEAGLE input should be written",required=true)
|
@Output(doc="File to which BEAGLE input should be written",required=true)
|
||||||
protected PrintStream beagleWriter = null;
|
protected PrintStream beagleWriter = null;
|
||||||
|
|
||||||
@Argument(fullName = "genotypes_file", shortName = "genotypes", doc = "File to print reference genotypes for error analysis", required = false)
|
|
||||||
public PrintStream beagleGenotypesWriter = null;
|
|
||||||
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
|
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
|
||||||
public double insertedNoCallRate = 0;
|
public double insertedNoCallRate = 0;
|
||||||
@Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
|
@Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
|
||||||
|
|
@ -81,7 +79,6 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
||||||
private Set<String> samples = null;
|
private Set<String> samples = null;
|
||||||
private Set<String> BOOTSTRAP_FILTER = new HashSet<String>( Arrays.asList("bootstrap") );
|
private Set<String> BOOTSTRAP_FILTER = new HashSet<String>( Arrays.asList("bootstrap") );
|
||||||
private Random generator;
|
private Random generator;
|
||||||
private int bootstrapCount = -1;
|
|
||||||
private int bootstrapSetSize = 0;
|
private int bootstrapSetSize = 0;
|
||||||
private int testSetSize = 0;
|
private int testSetSize = 0;
|
||||||
|
|
||||||
|
|
@ -95,8 +92,6 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
||||||
beagleWriter.print(String.format(" %s %s %s", sample, sample, sample));
|
beagleWriter.print(String.format(" %s %s %s", sample, sample, sample));
|
||||||
|
|
||||||
beagleWriter.println();
|
beagleWriter.println();
|
||||||
if (beagleGenotypesWriter != null)
|
|
||||||
beagleGenotypesWriter.println("dummy header");
|
|
||||||
|
|
||||||
if ( bootstrapVCFOutput != null ) {
|
if ( bootstrapVCFOutput != null ) {
|
||||||
initializeVcfWriter();
|
initializeVcfWriter();
|
||||||
|
|
@ -137,7 +132,7 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean goodSite(VariantContext v) {
|
public boolean goodSite(VariantContext v) {
|
||||||
return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes();
|
return v != null && ! v.isFiltered() && v.isBiallelic() && v.hasGenotypes();
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) {
|
public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) {
|
||||||
|
|
@ -165,13 +160,14 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private final static double[] HAPLOID_FLAT_LOG10_LIKELIHOODS = MathUtils.toLog10(new double[]{ 0.5, 0.0, 0.5 });
|
||||||
|
private final static double[] DIPLOID_FLAT_LOG10_LIKELIHOODS = MathUtils.toLog10(new double[]{ 0.33, 0.33, 0.33 });
|
||||||
|
|
||||||
public void writeBeagleOutput(VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
|
public void writeBeagleOutput(VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
|
||||||
GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),preferredVC);
|
GenomeLoc currentLoc = VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),preferredVC);
|
||||||
beagleWriter.print(String.format("%s:%d ",currentLoc.getContig(),currentLoc.getStart()));
|
StringBuffer beagleOut = new StringBuffer();
|
||||||
if ( beagleGenotypesWriter != null ) {
|
|
||||||
beagleGenotypesWriter.print(String.format("%s ",VariantContextUtils.getLocation(getToolkit().getGenomeLocParser(),preferredVC).toString()));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
beagleOut.append(String.format("%s:%d ",currentLoc.getContig(),currentLoc.getStart()));
|
||||||
for ( Allele allele : preferredVC.getAlleles() ) {
|
for ( Allele allele : preferredVC.getAlleles() ) {
|
||||||
String bglPrintString;
|
String bglPrintString;
|
||||||
if (allele.isNoCall() || allele.isNull())
|
if (allele.isNoCall() || allele.isNull())
|
||||||
|
|
@ -179,18 +175,16 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
||||||
else
|
else
|
||||||
bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele
|
bglPrintString = allele.getBaseString(); // get rid of * in case of reference allele
|
||||||
|
|
||||||
beagleWriter.print(String.format("%s ", bglPrintString));
|
beagleOut.append(String.format("%s ", bglPrintString));
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<String,Genotype> preferredGenotypes = preferredVC.getGenotypes();
|
Map<String,Genotype> preferredGenotypes = preferredVC.getGenotypes();
|
||||||
Map<String,Genotype> otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
|
Map<String,Genotype> otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
|
||||||
boolean isValidation;
|
|
||||||
for ( String sample : samples ) {
|
for ( String sample : samples ) {
|
||||||
boolean isMaleOnChrX = false;
|
boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getToolkit().getSampleById(sample).isMale();
|
||||||
if( CHECK_IS_MALE_ON_CHR_X && getToolkit().getSampleById(sample).isMale() ) {
|
|
||||||
isMaleOnChrX = true;
|
|
||||||
}
|
|
||||||
Genotype genotype;
|
Genotype genotype;
|
||||||
|
boolean isValidation;
|
||||||
// use sample as key into genotypes structure
|
// use sample as key into genotypes structure
|
||||||
if ( preferredGenotypes.keySet().contains(sample) ) {
|
if ( preferredGenotypes.keySet().contains(sample) ) {
|
||||||
genotype = preferredGenotypes.get(sample);
|
genotype = preferredGenotypes.get(sample);
|
||||||
|
|
@ -202,36 +196,22 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
||||||
// there is magically no genotype for this sample.
|
// there is magically no genotype for this sample.
|
||||||
throw new StingException("Sample "+sample+" arose with no genotype in variant or validation VCF. This should never happen.");
|
throw new StingException("Sample "+sample+" arose with no genotype in variant or validation VCF. This should never happen.");
|
||||||
}
|
}
|
||||||
/**
|
|
||||||
|
/*
|
||||||
* Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key
|
* Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key
|
||||||
*/
|
*/
|
||||||
|
double [] log10Likelihoods = null;
|
||||||
if ( (isValidation && prior < 0.0) || genotype.hasLikelihoods() ) {
|
if ( (isValidation && prior < 0.0) || genotype.hasLikelihoods() ) {
|
||||||
double[] likeArray = genotype.getLikelihoods().getAsVector();
|
log10Likelihoods = genotype.getLikelihoods().getAsVector();
|
||||||
if( isMaleOnChrX ) {
|
|
||||||
likeArray[1] = -255;
|
|
||||||
}
|
|
||||||
double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(likeArray);
|
|
||||||
// see if we need to randomly mask out genotype in this position.
|
// see if we need to randomly mask out genotype in this position.
|
||||||
Double d = generator.nextDouble();
|
if ( generator.nextDouble() <= insertedNoCallRate ) {
|
||||||
if (d > insertedNoCallRate ) {
|
|
||||||
// System.out.format("%5.4f ", d);
|
|
||||||
for (Double likeVal: normalizedLikelihoods)
|
|
||||||
beagleWriter.print(String.format("%5.4f ",likeVal));
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
// we are masking out this genotype
|
// we are masking out this genotype
|
||||||
if( isMaleOnChrX ) {
|
log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
|
||||||
beagleWriter.print("0.5 0.0 0.5 ");
|
|
||||||
} else {
|
|
||||||
beagleWriter.print("0.33 0.33 0.33 ");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( beagleGenotypesWriter != null && genotype.isCalled() ) {
|
if( isMaleOnChrX ) {
|
||||||
char a = genotype.getAllele(0).toString().charAt(0);
|
log10Likelihoods[1] = -255; // todo -- warning this is dangerous for multi-allele case
|
||||||
char b = genotype.getAllele(0).toString().charAt(0);
|
|
||||||
|
|
||||||
beagleGenotypesWriter.format("%c %c ", a, b);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/**
|
/**
|
||||||
|
|
@ -248,33 +228,28 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
|
||||||
else if (genotype.isHet()) { AB = prior; }
|
else if (genotype.isHet()) { AB = prior; }
|
||||||
else if (genotype.isHomVar()) { BB = prior; }
|
else if (genotype.isHomVar()) { BB = prior; }
|
||||||
|
|
||||||
if( isMaleOnChrX ) {
|
log10Likelihoods = MathUtils.toLog10(new double[]{ AA, isMaleOnChrX ? 0.0 : AB, BB });
|
||||||
beagleWriter.printf("%.2f %.2f %.2f ", AA, 0.0, BB);
|
|
||||||
} else {
|
|
||||||
beagleWriter.printf("%.2f %.2f %.2f ", AA, AB, BB);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (beagleGenotypesWriter != null) {
|
|
||||||
char a = genotype.getAllele(0).toString().charAt(0);
|
|
||||||
char b = genotype.getAllele(0).toString().charAt(0);
|
|
||||||
|
|
||||||
beagleGenotypesWriter.format("%c %c ", a, b);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if( isMaleOnChrX ) {
|
log10Likelihoods = isMaleOnChrX ? HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS;
|
||||||
beagleWriter.print("0.5 0.0 0.5 ");
|
|
||||||
} else {
|
|
||||||
beagleWriter.print("0.33 0.33 0.33 ");
|
|
||||||
} // write 1/3 likelihoods for uncalled genotypes.
|
|
||||||
if (beagleGenotypesWriter != null)
|
|
||||||
beagleGenotypesWriter.print(". . ");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
writeSampleLikelihoods(beagleOut, log10Likelihoods);
|
||||||
}
|
}
|
||||||
|
|
||||||
beagleWriter.println();
|
beagleWriter.println(beagleOut.toString());
|
||||||
if (beagleGenotypesWriter != null)
|
}
|
||||||
beagleGenotypesWriter.println();
|
|
||||||
|
private void writeSampleLikelihoods( StringBuffer out, double[] log10Likelihoods ) {
|
||||||
|
double[] normalizedLog10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods);
|
||||||
|
// see if we need to randomly mask out genotype in this position.
|
||||||
|
// todo -- remove me after testing
|
||||||
|
if ( log10Likelihoods == HAPLOID_FLAT_LOG10_LIKELIHOODS || log10Likelihoods == DIPLOID_FLAT_LOG10_LIKELIHOODS )
|
||||||
|
for (double likeVal: normalizedLog10Likelihoods)
|
||||||
|
out.append(String.format("%.2f ",likeVal));
|
||||||
|
else
|
||||||
|
for (double likeVal: normalizedLog10Likelihoods)
|
||||||
|
out.append(String.format("%5.4f ",likeVal));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -82,6 +82,19 @@ public class MathUtils {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Converts a real space array of probabilities into a log10 array
|
||||||
|
* @param prRealSpace
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public static double[] toLog10(double[] prRealSpace) {
|
||||||
|
double[] log10s = new double[prRealSpace.length];
|
||||||
|
for ( int i = 0; i < prRealSpace.length; i++ )
|
||||||
|
log10s[i] = Math.log10(prRealSpace[i]);
|
||||||
|
return log10s;
|
||||||
|
}
|
||||||
|
|
||||||
public static double log10sumLog10(double[] log10p, int start) {
|
public static double log10sumLog10(double[] log10p, int start) {
|
||||||
double sum = 0.0;
|
double sum = 0.0;
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue