Useful debug file output

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4672 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-11-15 13:36:52 +00:00
parent 28142408ff
commit d76b87d6e3
2 changed files with 34 additions and 3 deletions

View File

@ -467,10 +467,18 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
// Code to determine FDR tranches for VariantDatum[] // Code to determine FDR tranches for VariantDatum[]
// //
// --------------------------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------------------------
public final static List<Tranche> findTranches( final VariantDatum[] data, final double[] FDRtranches, double targetTiTv ) { public final static List<Tranche> findTranches( final VariantDatum[] data, final double[] FDRtranches, double targetTiTv ) {
return findTranches( data, FDRtranches, targetTiTv, null );
}
public final static List<Tranche> findTranches( final VariantDatum[] data, final double[] FDRtranches, double targetTiTv, File debugFile ) {
logger.info(String.format("Finding tranches for %d variants with %d FDRs and a target TiTv of %.2f", data.length, FDRtranches.length, targetTiTv));
List<VariantDatum> tranchesData = sortVariantsbyQual(data); List<VariantDatum> tranchesData = sortVariantsbyQual(data);
double[] runningTiTv = calculateRunningTiTv(tranchesData); double[] runningTiTv = calculateRunningTiTv(tranchesData);
if ( debugFile != null) { writeTranchesDebuggingInfo(debugFile, tranchesData, runningTiTv); }
List<Tranche> tranches = new ArrayList<Tranche>(); List<Tranche> tranches = new ArrayList<Tranche>();
for ( double fdr : FDRtranches ) { for ( double fdr : FDRtranches ) {
Tranche t = findTranche(tranchesData, runningTiTv, fdr, targetTiTv); Tranche t = findTranche(tranchesData, runningTiTv, fdr, targetTiTv);
@ -488,6 +496,19 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
return tranches; return tranches;
} }
private static void writeTranchesDebuggingInfo(File f, List<VariantDatum> tranchesData, double[] runningTiTv ) {
try {
PrintStream out = new PrintStream(f);
out.println("Qual isTransition runningTiTv");
for ( int i = 0; i < runningTiTv.length; i++ ) {
VariantDatum d = tranchesData.get(i);
out.printf("%.4f %d %.4f%n", d.qual, d.isTransition ? 1 : 0, runningTiTv[i]);
}
} catch (FileNotFoundException e) {
throw new UserException.CouldNotCreateOutputFile(f, e);
}
}
private final static List<VariantDatum> sortVariantsbyQual(final VariantDatum[] data) { private final static List<VariantDatum> sortVariantsbyQual(final VariantDatum[] data) {
List<VariantDatum> sorted = new ArrayList<VariantDatum>(Arrays.asList(data)); List<VariantDatum> sorted = new ArrayList<VariantDatum>(Arrays.asList(data));
Collections.sort(sorted); Collections.sort(sorted);
@ -510,16 +531,24 @@ public final class VariantGaussianMixtureModel extends VariantOptimizationModel
} }
public final static Tranche findTranche( final List<VariantDatum> data, double[] runningTiTv, final double desiredFDR, double targetTiTv ) { public final static Tranche findTranche( final List<VariantDatum> data, double[] runningTiTv, final double desiredFDR, double targetTiTv ) {
if ( data.size() != runningTiTv.length ) throw new ReviewedStingException("BUG: data and running titv are of different sizes");
final double titvThreshold = fdrToTiTv(desiredFDR, targetTiTv); // compute the desired TiTv final double titvThreshold = fdrToTiTv(desiredFDR, targetTiTv); // compute the desired TiTv
for ( int i = 0; i < runningTiTv.length; i++ ) { for ( int i = 0; i < runningTiTv.length; i++ ) {
if ( runningTiTv[i] >= titvThreshold ) { if ( runningTiTv[i] >= titvThreshold ) {
// we've found the largest group of variants with Ti/Tv >= our target titv // we've found the largest group of variants with Ti/Tv >= our target titv
return trancheOfVariants(data, i, desiredFDR, targetTiTv); Tranche t = trancheOfVariants(data, i, desiredFDR, targetTiTv);
logger.info(String.format(" Found tranche for %.3f FDR = %.3f TiTv threshold starting with variant %d; running TiTv is %.2f ",
desiredFDR, titvThreshold, i, runningTiTv[i]));
logger.info(String.format(" Trance is %s", t));
return t;
} }
} }
// we get here when there's no subset of variants with Ti/Tv >= threshold, in which case we should return null // we get here when there's no subset of variants with Ti/Tv >= threshold, in which case we should return null
logger.info(String.format(" **Couldn't find tranche for %.3f FDR = %.3f TiTv threshold; last running TiTv was %.2f ",
desiredFDR, titvThreshold, runningTiTv[runningTiTv.length-1]));
return null; return null;
} }

View File

@ -118,6 +118,8 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
@Hidden @Hidden
@Argument(fullName="quality_scale_factor", shortName="qScaleFactor", doc="Multiply all final quality scores by this value. FOR DEBUGGING PURPOSES ONLY.", required=false) @Argument(fullName="quality_scale_factor", shortName="qScaleFactor", doc="Multiply all final quality scores by this value. FOR DEBUGGING PURPOSES ONLY.", required=false)
private double QUALITY_SCALE_FACTOR = 1.0; private double QUALITY_SCALE_FACTOR = 1.0;
@Argument(fullName = "debugFile", shortName = "debugFile", doc = "Print debugging information here", required=false)
private File DEBUG_FILE = null;
///////////////////////////// /////////////////////////////
@ -333,7 +335,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys ); final VariantDataManager dataManager = new VariantDataManager( reduceSum, theModel.dataManager.annotationKeys );
reduceSum.clear(); // Don't need this ever again, clean up some memory reduceSum.clear(); // Don't need this ever again, clean up some memory
List<Tranche> tranches = VariantGaussianMixtureModel.findTranches( dataManager.data, FDR_TRANCHES, TARGET_TITV ); List<Tranche> tranches = VariantGaussianMixtureModel.findTranches( dataManager.data, FDR_TRANCHES, TARGET_TITV, DEBUG_FILE );
tranchesStream.print(Tranche.tranchesString(tranches)); tranchesStream.print(Tranche.tranchesString(tranches));
// Execute Rscript command to plot the optimization curve // Execute Rscript command to plot the optimization curve